From fbd24677963abedf88b6dbed277764e955374dd8 Mon Sep 17 00:00:00 2001 From: Aaditya <115080342+easyonaadit@users.noreply.github.com> Date: Wed, 18 Mar 2026 13:11:53 +0530 Subject: [PATCH] [AMDGPU] DPP implementations for Wave Reduction (#185814) Adding DPP reduction support for i32 types. Supported Ops: `umin`, `min`, `umax`, `max`, `add`, `sub`, `and`, `or`, `xor`. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 676 ++-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 20 +- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 1 + llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +- .../AMDGPU/amdgpu-cs-chain-fp-nosave.ll | 295 +- .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 3487 ++++++++++------- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll | 1189 +++++- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll | 994 ++++- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll | 1002 ++++- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll | 1002 ++++- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll | 994 ++++- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll | 1242 +++++- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 994 ++++- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 994 ++++- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll | 1118 +++++- llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll | 199 +- 16 files changed, 11320 insertions(+), 2889 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 2ba69ee089ba..2b1bd578ab45 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4655,7 +4655,7 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue WaveReduction = DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32); Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction, - Size, DAG.getConstant(0, dl, MVT::i32)); + Size, DAG.getTargetConstant(0, dl, MVT::i32)); SDValue ScaledSize = DAG.getNode( ISD::SHL, dl, VT, Size, DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); @@ -5639,6 +5639,32 @@ static bool isFloatingPointWaveReduceOperation(unsigned Opc) { Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64; } +static unsigned getDPPOpcForWaveReduction(unsigned Opc, + const GCNSubtarget &ST) { + switch (Opc) { + case AMDGPU::S_MIN_U32: + return AMDGPU::V_MIN_U32_dpp; + case AMDGPU::S_MIN_I32: + return AMDGPU::V_MIN_I32_dpp; + case AMDGPU::S_MAX_U32: + return AMDGPU::V_MAX_U32_dpp; + case AMDGPU::S_MAX_I32: + return AMDGPU::V_MAX_I32_dpp; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_SUB_I32: + return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp + : AMDGPU::V_ADD_CO_U32_dpp; + case AMDGPU::S_AND_B32: + return AMDGPU::V_AND_B32_dpp; + case AMDGPU::S_OR_B32: + return AMDGPU::V_OR_B32_dpp; + case AMDGPU::S_XOR_B32: + return AMDGPU::V_XOR_B32_dpp; + default: + llvm_unreachable("unhandled lane op"); + } +} + static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, @@ -5652,6 +5678,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, Register SrcReg = MI.getOperand(1).getReg(); bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg)); Register DstReg = MI.getOperand(0).getReg(); + unsigned Stratergy = static_cast(MI.getOperand(2).getImm()); + enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 }; MachineBasicBlock *RetBB = nullptr; if (isSGPR) { switch (Opc) { @@ -5918,267 +5946,431 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, } } } else { - // TODO: Implement DPP Strategy and switch based on immediate strategy - // operand. For now, for all the cases (default, Iterative and DPP we use - // iterative approach by default.) - - // To reduce the VGPR using iterative approach, we need to iterate - // over all the active lanes. Lowering consists of ComputeLoop, - // which iterate over only active lanes. We use copy of EXEC register - // as induction variable and every active lane modifies it using bitset0 - // so that we will get the next active lane for next iteration. MachineBasicBlock::iterator I = BB.end(); Register SrcReg = MI.getOperand(1).getReg(); bool is32BitOpc = is32bitWaveReduceOperation(Opc); bool isFPOp = isFloatingPointWaveReduceOperation(Opc); - - // Create Control flow for loop - // Split MI's Machine Basic block into For loop - auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true); - // Create virtual registers required for lowering. const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); - Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass); - Register IdentityValReg = MRI.createVirtualRegister(DstRegClass); - Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass); - Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); - Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); - Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - Register LaneValueReg = MRI.createVirtualRegister(DstRegClass); - + const TargetRegisterClass *SrcRegClass = MRI.getRegClass(SrcReg); bool IsWave32 = ST.isWave32(); unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + if (Stratergy == WAVE_REDUCE_STRATEGY::ITERATIVE || + !ST.hasDPP()) { // If target doesn't support DPP operations, default to + // iterative stratergy - // Create initial values of induction variable from Exec, Accumulator and - // insert branch instr to newly created ComputeBlock - BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg); - if (is32BitOpc) { - uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc); - BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg) - .addImm(IdentityValue); - } else { - uint64_t IdentityValue = - MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64 - ? 0x0 // +0.0 for double sub reduction - : getIdentityValueFor64BitWaveReduction(Opc); - BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg) - .addImm(IdentityValue); - } - // clang-format off - BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)) - .addMBB(ComputeLoop); - // clang-format on + // To reduce the VGPR using iterative approach, we need to iterate + // over all the active lanes. Lowering consists of ComputeLoop, + // which iterate over only active lanes. We use copy of EXEC register + // as induction variable and every active lane modifies it using bitset0 + // so that we will get the next active lane for next iteration. - // Start constructing ComputeLoop - I = ComputeLoop->begin(); - auto Accumulator = - BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg) - .addReg(IdentityValReg) - .addMBB(&BB); - auto ActiveBits = - BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg) - .addReg(LoopIterator) - .addMBB(&BB); + // Create Control flow for loop + // Split MI's Machine Basic block into For loop + auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true); - I = ComputeLoop->end(); - MachineInstr *NewAccumulator; - // Perform the computations - unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64; - BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) - .addReg(ActiveBitsReg); - if (is32BitOpc) { - BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), - LaneValueReg) - .addReg(SrcReg) - .addReg(FF1Reg); - if (isFPOp) { - Register LaneValVreg = - MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); - Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); - // Get the Lane Value in VGPR to avoid the Constant Bus Restriction - BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), - LaneValVreg) - .addReg(LaneValueReg); - BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg) - .addImm(0) // src0 modifier - .addReg(Accumulator->getOperand(0).getReg()) - .addImm(0) // src1 modifier - .addReg(LaneValVreg) - .addImm(0) // clamp - .addImm(0); // omod - NewAccumulator = BuildMI(*ComputeLoop, I, DL, - TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) - .addReg(DstVreg); + Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass); + Register IdentityValReg = MRI.createVirtualRegister(DstRegClass); + Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass); + Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); + Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); + Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register LaneValueReg = MRI.createVirtualRegister(DstRegClass); + + // Create initial values of induction variable from Exec, Accumulator and + // insert branch instr to newly created ComputeBlock + BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg); + if (is32BitOpc) { + uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc); + BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg) + .addImm(IdentityValue); } else { - NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) - .addReg(Accumulator->getOperand(0).getReg()) - .addReg(LaneValueReg); - } - } else { - Register LaneValueLoReg = - MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - Register LaneValueHiReg = - MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); - const TargetRegisterClass *SrcSubRC = - TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0); - MachineOperand Op1L = TII->buildExtractSubRegOrImm( - MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC); - MachineOperand Op1H = TII->buildExtractSubRegOrImm( - MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC); - // lane value input should be in an sgpr - BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), - LaneValueLoReg) - .add(Op1L) - .addReg(FF1Reg); - BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), - LaneValueHiReg) - .add(Op1H) - .addReg(FF1Reg); - auto LaneValue = BuildMI(*ComputeLoop, I, DL, - TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg) - .addReg(LaneValueLoReg) - .addImm(AMDGPU::sub0) - .addReg(LaneValueHiReg) - .addImm(AMDGPU::sub1); - switch (Opc) { - case AMDGPU::S_OR_B64: - case AMDGPU::S_AND_B64: - case AMDGPU::S_XOR_B64: { - NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) - .addReg(Accumulator->getOperand(0).getReg()) - .addReg(LaneValue->getOperand(0).getReg()) - .setOperandDead(3); // Dead scc - break; - } - case AMDGPU::V_CMP_GT_I64_e64: - case AMDGPU::V_CMP_GT_U64_e64: - case AMDGPU::V_CMP_LT_I64_e64: - case AMDGPU::V_CMP_LT_U64_e64: { - Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass); - Register ComparisonResultReg = - MRI.createVirtualRegister(WaveMaskRegClass); - int SrcIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src); - const TargetRegisterClass *VregClass = - TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx)); - const TargetRegisterClass *VSubRegClass = - TRI->getSubRegisterClass(VregClass, AMDGPU::sub0); - Register AccumulatorVReg = MRI.createVirtualRegister(VregClass); - MachineOperand SrcReg0Sub0 = - TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0), - VregClass, AMDGPU::sub0, VSubRegClass); - MachineOperand SrcReg0Sub1 = - TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0), - VregClass, AMDGPU::sub1, VSubRegClass); - BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), - AccumulatorVReg) - .add(SrcReg0Sub0) - .addImm(AMDGPU::sub0) - .add(SrcReg0Sub1) - .addImm(AMDGPU::sub1); - BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg) - .addReg(LaneValue->getOperand(0).getReg()) - .addReg(AccumulatorVReg); - - unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg) - .addReg(LaneMaskReg) - .addReg(ActiveBitsReg); - - NewAccumulator = BuildMI(*ComputeLoop, I, DL, - TII->get(AMDGPU::S_CSELECT_B64), DstReg) - .addReg(LaneValue->getOperand(0).getReg()) - .addReg(Accumulator->getOperand(0).getReg()); - break; - } - case AMDGPU::V_MIN_F64_e64: - case AMDGPU::V_MIN_NUM_F64_e64: - case AMDGPU::V_MAX_F64_e64: - case AMDGPU::V_MAX_NUM_F64_e64: - case AMDGPU::V_ADD_F64_e64: - case AMDGPU::V_ADD_F64_pseudo_e64: { - int SrcIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src); - const TargetRegisterClass *VregRC = - TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx)); - const TargetRegisterClass *VregSubRC = - TRI->getSubRegisterClass(VregRC, AMDGPU::sub0); - Register AccumulatorVReg = MRI.createVirtualRegister(VregRC); - Register DstVreg = MRI.createVirtualRegister(VregRC); - Register LaneValLo = - MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - Register LaneValHi = - MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg) - .addReg(Accumulator->getOperand(0).getReg()); - unsigned Modifier = + uint64_t IdentityValue = MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64 - ? SISrcMods::NEG - : SISrcMods::NONE; - auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg) - .addImm(Modifier) // src0 modifiers + ? 0x0 // +0.0 for double sub reduction + : getIdentityValueFor64BitWaveReduction(Opc); + BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), + IdentityValReg) + .addImm(IdentityValue); + } + // clang-format off + BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)) + .addMBB(ComputeLoop); + // clang-format on + + // Start constructing ComputeLoop + I = ComputeLoop->begin(); + auto Accumulator = + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg) + .addReg(IdentityValReg) + .addMBB(&BB); + auto ActiveBits = + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg) + .addReg(LoopIterator) + .addMBB(&BB); + + I = ComputeLoop->end(); + MachineInstr *NewAccumulator; + // Perform the computations + unsigned SFFOpc = + IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64; + BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) + .addReg(ActiveBitsReg); + if (is32BitOpc) { + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), + LaneValueReg) + .addReg(SrcReg) + .addReg(FF1Reg); + if (isFPOp) { + Register LaneValVreg = + MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); + Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg)); + // Get the Lane Value in VGPR to avoid the Constant Bus Restriction + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), + LaneValVreg) + .addReg(LaneValueReg); + BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg) + .addImm(0) // src0 modifier + .addReg(Accumulator->getOperand(0).getReg()) + .addImm(0) // src1 modifier + .addReg(LaneValVreg) + .addImm(0) // clamp + .addImm(0); // omod + NewAccumulator = + BuildMI(*ComputeLoop, I, DL, + TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(DstVreg); + } else { + NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) + .addReg(Accumulator->getOperand(0).getReg()) + .addReg(LaneValueReg); + } + } else { + Register LaneValueLoReg = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register LaneValueHiReg = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register LaneValReg = + MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + const TargetRegisterClass *SrcSubRC = + TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0); + MachineOperand Op1L = TII->buildExtractSubRegOrImm( + MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC); + MachineOperand Op1H = TII->buildExtractSubRegOrImm( + MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC); + // lane value input should be in an sgpr + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), + LaneValueLoReg) + .add(Op1L) + .addReg(FF1Reg); + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), + LaneValueHiReg) + .add(Op1H) + .addReg(FF1Reg); + auto LaneValue = + BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), + LaneValReg) + .addReg(LaneValueLoReg) + .addImm(AMDGPU::sub0) + .addReg(LaneValueHiReg) + .addImm(AMDGPU::sub1); + switch (Opc) { + case AMDGPU::S_OR_B64: + case AMDGPU::S_AND_B64: + case AMDGPU::S_XOR_B64: { + NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) + .addReg(Accumulator->getOperand(0).getReg()) .addReg(LaneValue->getOperand(0).getReg()) - .addImm(SISrcMods::NONE) // src1 modifiers - .addReg(AccumulatorVReg) - .addImm(SISrcMods::NONE) // clamp - .addImm(SISrcMods::NONE); // omod - auto ReadLaneLo = - BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - LaneValLo); - auto ReadLaneHi = - BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - LaneValHi); - MachineBasicBlock::iterator Iters = *ReadLaneLo; - MachineOperand Op1L = - TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0), - VregRC, AMDGPU::sub0, VregSubRC); - MachineOperand Op1H = - TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0), - VregRC, AMDGPU::sub1, VregSubRC); - ReadLaneLo.add(Op1L); - ReadLaneHi.add(Op1H); - NewAccumulator = BuildMI(*ComputeLoop, I, DL, - TII->get(TargetOpcode::REG_SEQUENCE), DstReg) - .addReg(LaneValLo) - .addImm(AMDGPU::sub0) - .addReg(LaneValHi) - .addImm(AMDGPU::sub1); - break; + .setOperandDead(3); // Dead scc + break; + } + case AMDGPU::V_CMP_GT_I64_e64: + case AMDGPU::V_CMP_GT_U64_e64: + case AMDGPU::V_CMP_LT_I64_e64: + case AMDGPU::V_CMP_LT_U64_e64: { + Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass); + Register ComparisonResultReg = + MRI.createVirtualRegister(WaveMaskRegClass); + int SrcIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src); + const TargetRegisterClass *VregClass = + TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx)); + const TargetRegisterClass *VSubRegClass = + TRI->getSubRegisterClass(VregClass, AMDGPU::sub0); + Register AccumulatorVReg = MRI.createVirtualRegister(VregClass); + MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Accumulator->getOperand(0), VregClass, AMDGPU::sub0, + VSubRegClass); + MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Accumulator->getOperand(0), VregClass, AMDGPU::sub1, + VSubRegClass); + BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), + AccumulatorVReg) + .add(SrcReg0Sub0) + .addImm(AMDGPU::sub0) + .add(SrcReg0Sub1) + .addImm(AMDGPU::sub1); + BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg) + .addReg(LaneValue->getOperand(0).getReg()) + .addReg(AccumulatorVReg); + + unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg) + .addReg(LaneMaskReg) + .addReg(ActiveBitsReg); + + NewAccumulator = BuildMI(*ComputeLoop, I, DL, + TII->get(AMDGPU::S_CSELECT_B64), DstReg) + .addReg(LaneValue->getOperand(0).getReg()) + .addReg(Accumulator->getOperand(0).getReg()); + break; + } + case AMDGPU::V_MIN_F64_e64: + case AMDGPU::V_MIN_NUM_F64_e64: + case AMDGPU::V_MAX_F64_e64: + case AMDGPU::V_MAX_NUM_F64_e64: + case AMDGPU::V_ADD_F64_e64: + case AMDGPU::V_ADD_F64_pseudo_e64: { + int SrcIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src); + const TargetRegisterClass *VregRC = + TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx)); + const TargetRegisterClass *VregSubRC = + TRI->getSubRegisterClass(VregRC, AMDGPU::sub0); + Register AccumulatorVReg = MRI.createVirtualRegister(VregRC); + Register DstVreg = MRI.createVirtualRegister(VregRC); + Register LaneValLo = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register LaneValHi = + MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg) + .addReg(Accumulator->getOperand(0).getReg()); + unsigned Modifier = + MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64 + ? SISrcMods::NEG + : SISrcMods::NONE; + auto DstVregInst = + BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg) + .addImm(Modifier) // src0 modifiers + .addReg(LaneValue->getOperand(0).getReg()) + .addImm(SISrcMods::NONE) // src1 modifiers + .addReg(AccumulatorVReg) + .addImm(SISrcMods::NONE) // clamp + .addImm(SISrcMods::NONE); // omod + auto ReadLaneLo = + BuildMI(*ComputeLoop, I, DL, + TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo); + auto ReadLaneHi = + BuildMI(*ComputeLoop, I, DL, + TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi); + MachineBasicBlock::iterator Iters = *ReadLaneLo; + MachineOperand Op1L = TII->buildExtractSubRegOrImm( + Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub0, + VregSubRC); + MachineOperand Op1H = TII->buildExtractSubRegOrImm( + Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub1, + VregSubRC); + ReadLaneLo.add(Op1L); + ReadLaneHi.add(Op1H); + NewAccumulator = BuildMI(*ComputeLoop, I, DL, + TII->get(TargetOpcode::REG_SEQUENCE), DstReg) + .addReg(LaneValLo) + .addImm(AMDGPU::sub0) + .addReg(LaneValHi) + .addImm(AMDGPU::sub1); + break; + } + case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::S_SUB_U64_PSEUDO: { + NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) + .addReg(Accumulator->getOperand(0).getReg()) + .addReg(LaneValue->getOperand(0).getReg()); + ComputeLoop = + Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop); + break; + } + } } - case AMDGPU::S_ADD_U64_PSEUDO: - case AMDGPU::S_SUB_U64_PSEUDO: { - NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) - .addReg(Accumulator->getOperand(0).getReg()) - .addReg(LaneValue->getOperand(0).getReg()); - ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop); - break; + // Manipulate the iterator to get the next active lane + unsigned BITSETOpc = + IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64; + BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg) + .addReg(FF1Reg) + .addReg(ActiveBitsReg); + + // Add phi nodes + Accumulator.addReg(DstReg).addMBB(ComputeLoop); + ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop); + + // Creating branching + unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64; + BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc)) + .addReg(NewActiveBitsReg) + .addImm(0); + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) + .addMBB(ComputeLoop); + + RetBB = ComputeEnd; + } else { + assert(ST.hasDPP() && "Sub Target does not support DPP Operations"); + + Register SrcWithIdentity = MRI.createVirtualRegister(SrcRegClass); + Register IdentityVGPR = MRI.createVirtualRegister(SrcRegClass); + Register IdentitySGPR = MRI.createVirtualRegister(DstRegClass); + Register DPPRowShr1 = MRI.createVirtualRegister(SrcRegClass); + Register DPPRowShr2 = MRI.createVirtualRegister(SrcRegClass); + Register DPPRowShr4 = MRI.createVirtualRegister(SrcRegClass); + Register DPPRowShr8 = MRI.createVirtualRegister(SrcRegClass); + Register RowBcast15 = MRI.createVirtualRegister(SrcRegClass); + Register ReducedValSGPR = MRI.createVirtualRegister(DstRegClass); + Register NegatedReducedVal = MRI.createVirtualRegister(DstRegClass); + Register RowBcast31 = MRI.createVirtualRegister(SrcRegClass); + Register UndefExec = MRI.createVirtualRegister(WaveMaskRegClass); + Register FinalDPPResult; + BuildMI(BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec); + + uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc); + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), IdentitySGPR) + .addImm(IdentityValue); + BuildMI(BB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR) + .addReg(IdentitySGPR); + + // Set inactive lanes to the identity value. + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_SET_INACTIVE_B32), SrcWithIdentity) + .addImm(0) // src0 modifiers + .addReg(SrcReg) // src0 + .addImm(0) // src1 modifiers + .addReg(IdentityVGPR) // identity value for inactive lanes + .addReg(UndefExec); // bool i1 + + unsigned DPPOpc = getDPPOpcForWaveReduction(Opc, ST); + auto BuildDPPMachineInstr = [&](Register Dst, Register Src, + unsigned DPPCtrl) { + BuildMI(BB, MI, DL, TII->get(DPPOpc), Dst) + .addReg(Src) // old + .addReg(Src) // src0 + .addReg(Src) // src1 + .addImm(DPPCtrl) // dpp-ctrl + .addImm(0xf) // row-mask + .addImm(0xf) // bank-mask + .addImm(0); // bound-control + }; + // DPP reduction + BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentity, + AMDGPU::DPP::ROW_SHR_FIRST); + + BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1, + (AMDGPU::DPP::ROW_SHR_FIRST + 1)); + + BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2, + (AMDGPU::DPP::ROW_SHR_FIRST + 3)); + + BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4, + (AMDGPU::DPP::ROW_SHR_FIRST + 7)); + + if (ST.hasDPPBroadcasts()) { + BuildDPPMachineInstr(RowBcast15, DPPRowShr8, AMDGPU::DPP::BCAST15); + } else { + // magic constant: 0x1E0 + // To Set BIT_MODE : bit 15 = 0 + // XOR mask : bit [14:10] = 0 + // OR mask : bit [9:5] = 15 + // AND mask : bit [4:0] = 0 + Register SwizzledValue = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32), SwizzledValue) + .addReg(DPPRowShr8) // addr + .addImm(0x1E0) // swizzle offset (i16) + .addImm(0x0); // gds (i1) + auto ClampInstr = + BuildMI(BB, MI, DL, + TII->get(TII->getVALUOp( + Opc == AMDGPU::S_SUB_I32 + ? static_cast(AMDGPU::S_ADD_I32) + : Opc)), + RowBcast15) + .addReg(DPPRowShr8) + .addReg(SwizzledValue); + if (TII->hasIntClamp(*ClampInstr) || TII->hasFPClamp(*ClampInstr)) + ClampInstr.addImm(0); } + FinalDPPResult = RowBcast15; + if (!IsWave32) { + if (ST.hasDPPBroadcasts()) { + BuildDPPMachineInstr(RowBcast31, RowBcast15, AMDGPU::DPP::BCAST31); + } else { + Register ShiftedThreadID = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register PermuteByteOffset = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register PermutedValue = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register Lane32Offset = MRI.createVirtualRegister(DstRegClass); + Register WordSizeConst = MRI.createVirtualRegister(DstRegClass); + Register ThreadIDRegLo = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register ThreadIDReg = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + // Get the thread ID. + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64), + ThreadIDRegLo) + .addImm(-1) + .addImm(0); + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64), + ThreadIDReg) + .addImm(-1) + .addReg(ThreadIDRegLo); + // shift each lane over by 32 positions, so value in 31st lane is + // present in 63rd lane. + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset) + .addImm(0x20); + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), ShiftedThreadID) + .addReg(ThreadIDReg) + .addReg(Lane32Offset) + .addImm(0); // clamp + // multiply by reg size. + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst) + .addImm(0x4); + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64), + PermuteByteOffset) + .addReg(WordSizeConst) + .addReg(ShiftedThreadID); + // Permute the lanes + BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32), PermutedValue) + .addReg(PermuteByteOffset) // addr + .addReg(RowBcast15) // data + .addImm(0); // offset + auto ClampInstr = + BuildMI(BB, MI, DL, + TII->get(TII->getVALUOp( + Opc == AMDGPU::S_SUB_I32 + ? static_cast(AMDGPU::S_ADD_I32) + : Opc)), + RowBcast31) + .addReg(RowBcast15) + .addReg(PermutedValue); + if (TII->hasIntClamp(*ClampInstr) || TII->hasFPClamp(*ClampInstr)) + ClampInstr.addImm(0); + } + FinalDPPResult = RowBcast31; } + // The final reduced value is in the last lane. + BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), ReducedValSGPR) + .addReg(FinalDPPResult) + .addImm(ST.getWavefrontSize() - 1); + if (Opc == AMDGPU::S_SUB_I32) + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal) + .addImm(0) + .addReg(ReducedValSGPR); + // Mark the final result as a whole-wave-mode calculation. + BuildMI(BB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg) + .addReg(Opc == AMDGPU::S_SUB_I32 ? NegatedReducedVal + : ReducedValSGPR); + RetBB = &BB; } - // Manipulate the iterator to get the next active lane - unsigned BITSETOpc = - IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64; - BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg) - .addReg(FF1Reg) - .addReg(ActiveBitsReg); - - // Add phi nodes - Accumulator.addReg(DstReg).addMBB(ComputeLoop); - ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop); - - // Creating branching - unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64; - BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc)) - .addReg(NewActiveBitsReg) - .addImm(0); - BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) - .addMBB(ComputeLoop); - - RetBB = ComputeEnd; } MI.eraseFromParent(); return RetBB; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index de118f3dbbf1..45b8a49392e0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5948,11 +5948,21 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return true; } +unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { + if (MI.getOpcode() == AMDGPU::S_MOV_B32) { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg()) + ? AMDGPU::COPY + : AMDGPU::V_MOV_B32_e32; + } + return getVALUOp(MI.getOpcode()); +} + // It is more readable to list mapped opcodes on the same line. // clang-format off -unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { - switch (MI.getOpcode()) { +unsigned SIInstrInfo::getVALUOp(unsigned Opc) const { + switch (Opc) { default: return AMDGPU::INSTRUCTION_LIST_END; case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; case AMDGPU::COPY: return AMDGPU::COPY; @@ -5962,12 +5972,6 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM; case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM; case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM; - case AMDGPU::S_MOV_B32: { - const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); - return MI.getOperand(1).isReg() || - RI.isAGPR(MRI, MI.getOperand(0).getReg()) ? - AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; - } case AMDGPU::S_ADD_I32: return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; case AMDGPU::S_ADDC_U32: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 87d51ffe8366..8bb2b19592a3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1361,6 +1361,7 @@ public: StringRef &ErrInfo) const override; unsigned getVALUOp(const MachineInstr &MI) const; + unsigned getVALUOp(unsigned Opc) const; void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 2846398c4d82..b3b8d9863ca0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -354,7 +354,7 @@ multiclass let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, UseNamedOperandTable = 1, Uses = [EXEC] in { def !toupper(Op) #"_PSEUDO_" #DataType : VPseudoInstSI<(outs RetReg : $sdst), - (ins Reg : $src, VSrc_b32 : $strategy), + (ins Reg : $src, i32imm : $strategy), [(set ty : $sdst, (!cast("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {} } } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll index 236fcc95f223..ddb2840e959d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll @@ -133,9 +133,9 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) { ; GFX942-NEXT: s_mov_b32 s33, s32 ; GFX942-NEXT: s_add_i32 s32, s32, 16 ; GFX942-NEXT: s_and_b32 s0, s0, -16 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_lshl_b32 s0, s0, 6 ; GFX942-NEXT: s_mov_b32 s1, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_add_i32 s32, s1, s0 ; GFX942-NEXT: scratch_store_dword off, v0, s1 ; GFX942-NEXT: s_endpgm @@ -152,54 +152,74 @@ define amdgpu_cs_chain void @test_alloca_var(i32 %count) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 -; GFX12-NEXT: s_mov_b64 s[0:1], exec -; GFX12-NEXT: s_mov_b32 s2, 0 +; GFX12-NEXT: v_lshl_add_u32 v3, v8, 2, 15 ; GFX12-NEXT: s_mov_b32 s33, s32 ; GFX12-NEXT: s_add_co_i32 s32, s32, 16 -; GFX12-NEXT: v_and_b32_e32 v1, -16, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v3, -16, v3 +; GFX12-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_ctz_i32_b64 s3, s[0:1] -; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_readlane_b32 s4, v1, s3 -; GFX12-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX12-NEXT: s_max_u32 s2, s2, s4 -; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v3, s[0:1] +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: v_add_nc_u32_e32 v2, 32, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: v_mul_lo_u32 v2, 4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15) +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX12-NEXT: ds_permute_b32 v1, v2, v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s2, v0, 63 +; GFX12-NEXT: s_mov_b64 exec, s[0:1] ; GFX12-NEXT: s_mov_b32 s0, s32 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_lshl_add_u32 v1, s2, 6, s0 -; GFX12-NEXT: scratch_store_b32 off, v0, s0 -; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: v_lshl_add_u32 v3, s2, 6, s0 +; GFX12-NEXT: scratch_store_b32 off, v4, s0 +; GFX12-NEXT: v_readfirstlane_b32 s32, v3 ; GFX12-NEXT: s_endpgm ; ; GFX942-LABEL: test_alloca_var: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 -; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_mov_b64 s[0:1], exec -; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: v_lshl_add_u32 v1, v8, 2, 15 ; GFX942-NEXT: s_mov_b32 s33, s32 ; GFX942-NEXT: s_add_i32 s32, s32, 16 -; GFX942-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: v_readlane_b32 s4, v1, s3 -; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX942-NEXT: s_max_u32 s2, s2, s4 -; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX942-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX942-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readlane_b32 s2, v0, 63 +; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: s_mov_b32 s0, s32 ; GFX942-NEXT: v_mov_b32_e32 v1, s0 ; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 -; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: scratch_store_dword off, v1, s0 ; GFX942-NEXT: s_endpgm %v = alloca i32, i32 %count, align 4, addrspace(5) store i32 0, ptr addrspace(5) %v, align 4 @@ -302,8 +322,8 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) ; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 ; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_add_i32 s32, s3, s2 ; GFX942-NEXT: scratch_store_dword off, v0, s3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) @@ -323,35 +343,50 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 -; GFX12-NEXT: s_mov_b64 s[0:1], exec -; GFX12-NEXT: s_mov_b32 s2, 0 +; GFX12-NEXT: v_lshl_add_u32 v3, v8, 2, 15 ; GFX12-NEXT: s_mov_b32 s33, s32 ; GFX12-NEXT: s_add_co_i32 s32, s32, 16 -; GFX12-NEXT: v_and_b32_e32 v1, -16, v0 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v3, -16, v3 +; GFX12-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v3, s[0:1] +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 +; GFX12-NEXT: s_getpc_b64 s[2:3] ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_readlane_b32 s4, v1, s3 -; GFX12-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX12-NEXT: s_max_u32 s2, s2, s4 +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cbranch_scc1 .LBB6_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: v_add_nc_u32_e32 v2, 32, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: v_mul_lo_u32 v2, 4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15) +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX12-NEXT: ds_permute_b32 v1, v2, v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX12-NEXT: s_mov_b64 exec, s[0:1] +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s4, v0, 63 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_mov_b64 exec, s[2:3] +; GFX12-NEXT: s_mov_b32 s2, s32 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 -; GFX12-NEXT: s_mov_b32 s3, s32 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-NEXT: v_lshl_add_u32 v1, s2, 6, s3 -; GFX12-NEXT: scratch_store_b32 off, v0, s3 -; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: v_lshl_add_u32 v3, s4, 6, s2 +; GFX12-NEXT: scratch_store_b32 off, v4, s2 +; GFX12-NEXT: v_readfirstlane_b32 s32, v3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -360,21 +395,27 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) { ; GFX942-LABEL: test_alloca_and_call_var: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 -; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_mov_b64 s[0:1], exec -; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: v_lshl_add_u32 v1, v8, 2, 15 ; GFX942-NEXT: s_mov_b32 s33, s32 ; GFX942-NEXT: s_add_i32 s32, s32, 16 -; GFX942-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: v_readlane_b32 s4, v1, s3 -; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX942-NEXT: s_max_u32 s2, s2, s4 -; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX942-NEXT: s_cbranch_scc1 .LBB6_1 -; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX942-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readlane_b32 s2, v0, 63 +; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: s_getpc_b64 s[0:1] ; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 ; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 @@ -382,8 +423,10 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) { ; GFX942-NEXT: s_mov_b32 s3, s32 ; GFX942-NEXT: v_mov_b32_e32 v1, s3 ; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 -; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: scratch_store_dword off, v1, s3 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX942-NEXT: s_endpgm @@ -467,13 +510,13 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) ; GFX12-NEXT: s_mov_b32 s4, s32 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_lshl_b32 s0, s0, 6 -; GFX12-NEXT: v_mov_b32_e32 v40, 0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_add_co_i32 s32, s4, s0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: scratch_store_b32 off, v0, s4 ; GFX12-NEXT: s_endpgm ; ; GFX942-LABEL: test_call_and_alloca_var_uniform: @@ -489,11 +532,11 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) ; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX942-NEXT: s_mov_b32 s4, s32 -; GFX942-NEXT: v_mov_b32_e32 v40, 0 ; GFX942-NEXT: s_add_i32 s32, s4, s2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: scratch_store_dword off, v0, s4 ; GFX942-NEXT: s_endpgm %v = alloca i32, i32 %count, align 4, addrspace(5) call amdgpu_gfx void @foo() @@ -509,71 +552,93 @@ define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 -; GFX12-NEXT: v_mov_b32_e32 v40, 0 -; GFX12-NEXT: s_mov_b64 s[0:1], exec -; GFX12-NEXT: s_mov_b32 s2, 0 +; GFX12-NEXT: v_lshl_add_u32 v3, v8, 2, 15 ; GFX12-NEXT: s_mov_b32 s33, s32 -; GFX12-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX12-NEXT: s_add_co_i32 s32, s32, 16 -; GFX12-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v3, -16, v3 +; GFX12-NEXT: s_or_saveexec_b64 s[0:1], -1 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v3, s[0:1] +; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 +; GFX12-NEXT: s_getpc_b64 s[2:3] ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: v_readlane_b32 s4, v0, s3 -; GFX12-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX12-NEXT: s_max_u32 s2, s2, s4 +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX12-NEXT: ; %bb.2: -; GFX12-NEXT: s_getpc_b64 s[0:1] -; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 -; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 -; GFX12-NEXT: s_mov_b32 s4, s32 -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-NEXT: v_lshl_add_u32 v0, s2, 6, s4 +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: v_add_nc_u32_e32 v2, 32, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: v_mul_lo_u32 v2, 4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX12-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15) +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX12-NEXT: ds_permute_b32 v1, v2, v0 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX12-NEXT: s_mov_b64 exec, s[0:1] +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX12-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readfirstlane_b32 s32, v0 +; GFX12-NEXT: v_readlane_b32 s4, v0, 63 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: s_mov_b64 exec, s[2:3] +; GFX12-NEXT: s_mov_b32 s5, s32 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: v_lshl_add_u32 v3, s4, 6, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readfirstlane_b32 s32, v3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: v_mov_b32_e32 v3, 0 +; GFX12-NEXT: scratch_store_b32 off, v3, s5 ; GFX12-NEXT: s_endpgm ; ; GFX942-LABEL: test_call_and_alloca_var: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 -; GFX942-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX942-NEXT: v_mov_b32_e32 v40, 0 -; GFX942-NEXT: s_mov_b64 s[0:1], exec -; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: v_lshl_add_u32 v1, v8, 2, 15 ; GFX942-NEXT: s_mov_b32 s33, s32 ; GFX942-NEXT: s_add_i32 s32, s32, 16 -; GFX942-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] -; GFX942-NEXT: v_readlane_b32 s4, v0, s3 -; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 -; GFX942-NEXT: s_max_u32 s2, s2, s4 -; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX942-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX942-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[0:1] +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readlane_b32 s2, v0, 63 +; GFX942-NEXT: s_mov_b64 exec, s[0:1] ; GFX942-NEXT: s_getpc_b64 s[0:1] ; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 ; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX942-NEXT: s_mov_b32 s4, s32 -; GFX942-NEXT: v_mov_b32_e32 v0, s4 -; GFX942-NEXT: v_lshl_add_u32 v0, s2, 6, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, s4 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 ; GFX942-NEXT: s_nop 0 -; GFX942-NEXT: v_readfirstlane_b32 s32, v0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: scratch_store_dword off, v1, s4 ; GFX942-NEXT: s_endpgm %v = alloca i32, i32 %count, align 4, addrspace(5) call amdgpu_gfx void @foo() diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index d19a260db355..95c1dd4d35f8 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -248,20 +248,26 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() { ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 -; GFX9-SDAG-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s6, v1, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 @@ -273,54 +279,64 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() { ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent: ; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400 ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400 -; GFX9-GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX9-GISEL-NEXT: ; %bb.2: -; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 -; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s7, v1, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s7, 6 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, 16 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX11-SDAG-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v1, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x7b +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_endpgm @@ -328,28 +344,31 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent() { ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s32, 16 +; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB3_1 -; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s2, v1, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s2, 5 +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -366,24 +385,30 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned ; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 ; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0 -; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 -; GFX9-SDAG-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 -; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s7, v1, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc -; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s7, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x1bc +; GFX9-SDAG-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_endpgm ; @@ -391,85 +416,98 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_over_aligned ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 ; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x2000 -; GFX9-GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s6, v1, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0x1fff ; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffffe000 ; GFX9-GISEL-NEXT: s_lshl_b32 s4, s6, 6 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_movk_i32 s32, 0x80 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo -; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff -; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 ; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_movk_i32 s32, 0x80 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX11-SDAG-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX11-SDAG-NEXT: ; %bb.2: +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v1, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x1bc +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_over_aligned: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 ; GFX11-GISEL-NEXT: s_movk_i32 s32, 0x80 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB4_1 -; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s1, v1, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc -; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0xfff -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX11-GISEL-NEXT: s_add_u32 s0, s32, 0xfff +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-GISEL-NEXT: s_and_b32 s0, s0, 0xfffff000 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -482,20 +520,26 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 -; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x400 -; GFX9-SDAG-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s6, v1, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 @@ -507,52 +551,63 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned: ; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400 ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 4, 15 -; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x400 -; GFX9-GISEL-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX9-GISEL-NEXT: ; %bb.2: -; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 -; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s7, v1, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s7, 6 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s4 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, 16 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-SDAG-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v1, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x29a +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_endpgm @@ -560,28 +615,31 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_divergent_under_aligne ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_divergent_under_aligned: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s32, 16 +; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 4, 15 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB5_1 -; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s2, v1, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a -; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s2, 5 +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -596,42 +654,48 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17 ; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 ; GFX9-SDAG-NEXT: s_mov_b32 s33, 0 +; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x2000 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 ; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15 ; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 +; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0xfffff000 ; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: s_add_i32 s32, s8, s5 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec -; GFX9-SDAG-NEXT: s_add_i32 s32, s9, s5 -; GFX9-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s5, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s10, v0, s5 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s5 -; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB6_2 -; GFX9-SDAG-NEXT: ; %bb.3: -; GFX9-SDAG-NEXT: s_mov_b32 s5, s32 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s5, v1, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-SDAG-NEXT: s_mov_b32 s6, s32 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s5, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-SDAG-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s5 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: .LBB6_4: ; %bb.1 +; GFX9-SDAG-NEXT: .LBB6_2: ; %bb.1 ; GFX9-SDAG-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, -16 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 @@ -650,42 +714,48 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x2000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x2000 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.0 ; GFX9-GISEL-NEXT: s_lshl2_add_u32 s5, s5, 15 ; GFX9-GISEL-NEXT: s_and_b32 s5, s5, -16 -; GFX9-GISEL-NEXT: s_lshl_b32 s6, s5, 6 -; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0xfff -; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xfffff000 +; GFX9-GISEL-NEXT: s_add_u32 s6, s32, 0xfff +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 6 +; GFX9-GISEL-NEXT: s_and_b32 s8, s6, 0xfffff000 +; GFX9-GISEL-NEXT: s_add_u32 s32, s8, s5 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s6 +; GFX9-GISEL-NEXT: s_mov_b32 s5, s32 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9-GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-GISEL-NEXT: v_readlane_b32 s10, v0, s9 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s10 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB6_2 -; GFX9-GISEL-NEXT: ; %bb.3: -; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[6:7] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s9, v1, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: s_lshl_b32 s7, s8, 6 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-GISEL-NEXT: s_lshl_b32 s6, s9, 6 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s7 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s6 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: .LBB6_4: ; %bb.1 +; GFX9-GISEL-NEXT: .LBB6_2: ; %bb.1 ; GFX9-GISEL-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-GISEL-NEXT: s_mov_b32 s5, s32 ; GFX9-GISEL-NEXT: s_and_b32 s4, s4, -16 @@ -694,60 +764,66 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-SDAG-NEXT: s_mov_b32 s2, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s33, 0 ; GFX11-SDAG-NEXT: s_movk_i32 s32, 0x80 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 ; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff -; GFX11-SDAG-NEXT: s_and_b32 s4, s1, -16 -; GFX11-SDAG-NEXT: s_and_b32 s1, s3, 0xfffff800 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_lshl_b32 s3, s4, 5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX11-SDAG-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 -; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4 -; GFX11-SDAG-NEXT: s_max_u32 s2, s2, s5 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB6_2 -; GFX11-SDAG-NEXT: ; %bb.3: -; GFX11-SDAG-NEXT: s_mov_b32 s3, s32 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s2, 5, s3 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: s_and_b32 s1, s1, -16 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s4, v1, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-SDAG-NEXT: s_and_b32 s2, s3, 0xfffff800 +; GFX11-SDAG-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 +; GFX11-SDAG-NEXT: s_add_i32 s32, s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s3 dlc +; GFX11-SDAG-NEXT: scratch_store_b32 off, v4, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s4, 5, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: .LBB6_4: ; %bb.1 +; GFX11-SDAG-NEXT: .LBB6_2: ; %bb.1 ; GFX11-SDAG-NEXT: s_lshl2_add_u32 s0, s0, 15 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, -16 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_add_i32 s32, s1, s0 ; GFX11-SDAG-NEXT: s_endpgm @@ -755,50 +831,55 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_multiple_allocas(i32 % ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_multiple_allocas: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-GISEL-NEXT: s_mov_b32 s2, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 ; GFX11-GISEL-NEXT: s_movk_i32 s32, 0x80 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB6_4 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.0 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_lshl2_add_u32 s1, s1, 15 -; GFX11-GISEL-NEXT: s_add_u32 s3, s32, 0x7ff -; GFX11-GISEL-NEXT: s_and_b32 s1, s1, -16 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: s_lshl_b32 s4, s1, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-GISEL-NEXT: s_and_b32 s1, s3, 0xfffff800 -; GFX11-GISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s4 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB6_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s5, v0, s4 -; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB6_2 -; GFX11-GISEL-NEXT: ; %bb.3: -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 -; GFX11-GISEL-NEXT: s_mov_b32 s3, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s2, s2, 5 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s2 +; GFX11-GISEL-NEXT: s_lshl2_add_u32 s1, s1, 15 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX11-GISEL-NEXT: s_add_u32 s2, s32, 0x7ff +; GFX11-GISEL-NEXT: s_and_b32 s1, s1, -16 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xfffff800 +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s2, s1 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s2 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s3 dlc +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s2, s3, 5 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v3, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_add_u32 s32, s3, s2 -; GFX11-GISEL-NEXT: .LBB6_4: ; %bb.1 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s2 +; GFX11-GISEL-NEXT: .LBB6_2: ; %bb.1 ; GFX11-GISEL-NEXT: s_lshl2_add_u32 s0, s0, 15 -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v3, 2 ; GFX11-GISEL-NEXT: s_and_b32 s0, s0, -16 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s33 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-GISEL-NEXT: scratch_store_b32 off, v3, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 ; GFX11-GISEL-NEXT: s_endpgm @@ -830,20 +911,26 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x1000 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, 0 -; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec -; GFX9-SDAG-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s8 -; GFX9-SDAG-NEXT: s_max_u32 s4, s4, s9 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB7_2 -; GFX9-SDAG-NEXT: ; %bb.3: +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-SDAG-NEXT: s_mov_b32 s6, s32 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s4, 6, v0 @@ -851,71 +938,77 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_5 -; GFX9-SDAG-NEXT: .LBB7_4: ; %bb.0 +; GFX9-SDAG-NEXT: s_cbranch_execnz .LBB7_3 +; GFX9-SDAG-NEXT: .LBB7_2: ; %bb.0 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0xfff ; GFX9-SDAG-NEXT: s_lshl2_add_u32 s5, s5, 15 ; GFX9-SDAG-NEXT: s_and_b32 s4, s4, 0xfffff000 ; GFX9-SDAG-NEXT: s_and_b32 s5, s5, -16 ; GFX9-SDAG-NEXT: s_lshl_b32 s5, s5, 6 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-SDAG-NEXT: s_add_i32 s32, s4, s5 -; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-SDAG-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: .LBB7_5: ; %bb.2 +; GFX9-SDAG-NEXT: .LBB7_3: ; %bb.2 ; GFX9-SDAG-NEXT: s_endpgm -; GFX9-SDAG-NEXT: .LBB7_6: -; GFX9-SDAG-NEXT: s_branch .LBB7_4 +; GFX9-SDAG-NEXT: .LBB7_4: +; GFX9-SDAG-NEXT: s_branch .LBB7_2 ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow: ; GFX9-GISEL: ; %bb.0: ; %entry ; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GFX9-GISEL-NEXT: s_add_u32 s0, s0, s17 ; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s33, 0 +; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x1000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 ; GFX9-GISEL-NEXT: s_mov_b32 s4, 1 -; GFX9-GISEL-NEXT: s_movk_i32 s32, 0x1000 -; GFX9-GISEL-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX9-GISEL-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.1 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s8, s32 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9-GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[6:7] -; GFX9-GISEL-NEXT: v_readlane_b32 s9, v0, s4 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s4 -; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s9 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB7_2 -; GFX9-GISEL-NEXT: ; %bb.3: -; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 -; GFX9-GISEL-NEXT: s_lshl_b32 s6, s8, 6 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s6 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX9-GISEL-NEXT: .LBB7_4: ; %Flow +; GFX9-GISEL-NEXT: s_nop 0 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s9, v1, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-GISEL-NEXT: s_lshl_b32 s6, s9, 6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-GISEL-NEXT: s_add_u32 s32, s8, s6 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: .LBB7_2: ; %Flow ; GFX9-GISEL-NEXT: s_xor_b32 s4, s4, 1 ; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB7_6 -; GFX9-GISEL-NEXT: ; %bb.5: ; %bb.0 +; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB7_4 +; GFX9-GISEL-NEXT: ; %bb.3: ; %bb.0 ; GFX9-GISEL-NEXT: s_lshl2_add_u32 s4, s5, 15 ; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0xfff ; GFX9-GISEL-NEXT: s_and_b32 s4, s4, -16 ; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xfffff000 ; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: .LBB7_6: ; %bb.2 +; GFX9-GISEL-NEXT: .LBB7_4: ; %bb.2 ; GFX9-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_kernel_control_flow: @@ -925,31 +1018,35 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX11-SDAG-NEXT: s_mov_b32 s32, 64 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc0 .LBB7_6 +; GFX11-SDAG-NEXT: s_cbranch_scc0 .LBB7_4 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.1 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX11-SDAG-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s4 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB7_2 -; GFX11-SDAG-NEXT: ; %bb.3: -; GFX11-SDAG-NEXT: s_mov_b32 s2, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s2 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s2, v1, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s2, 5, s3 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s3 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_5 -; GFX11-SDAG-NEXT: .LBB7_4: ; %bb.0 +; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_3 +; GFX11-SDAG-NEXT: .LBB7_2: ; %bb.0 ; GFX11-SDAG-NEXT: s_lshl2_add_u32 s1, s1, 15 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 2 ; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0x7ff @@ -959,50 +1056,53 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_add_i32 s32, s0, s1 -; GFX11-SDAG-NEXT: .LBB7_5: ; %bb.2 +; GFX11-SDAG-NEXT: .LBB7_3: ; %bb.2 ; GFX11-SDAG-NEXT: s_endpgm -; GFX11-SDAG-NEXT: .LBB7_6: -; GFX11-SDAG-NEXT: s_branch .LBB7_4 +; GFX11-SDAG-NEXT: .LBB7_4: +; GFX11-SDAG-NEXT: s_branch .LBB7_2 ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_kernel_control_flow: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-GISEL-NEXT: s_mov_b32 s2, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s33, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s32, 64 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX11-GISEL-NEXT: s_mov_b32 s0, 1 -; GFX11-GISEL-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-GISEL-NEXT: s_cbranch_scc0 .LBB7_2 ; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.1 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX11-GISEL-NEXT: s_mov_b32 s2, s32 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB7_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s3, s0 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3 -; GFX11-GISEL-NEXT: s_bitset0_b32 s0, s3 -; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s4 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB7_2 -; GFX11-GISEL-NEXT: ; %bb.3: +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 1 -; GFX11-GISEL-NEXT: s_mov_b32 s3, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s2, 5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s3, s0 +; GFX11-GISEL-NEXT: s_lshl_b32 s3, s3, 5 ; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s3 dlc +; GFX11-GISEL-NEXT: s_add_u32 s32, s2, s3 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s2 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: .LBB7_4: ; %Flow +; GFX11-GISEL-NEXT: .LBB7_2: ; %Flow ; GFX11-GISEL-NEXT: s_xor_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB7_6 -; GFX11-GISEL-NEXT: ; %bb.5: ; %bb.0 +; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB7_4 +; GFX11-GISEL-NEXT: ; %bb.3: ; %bb.0 ; GFX11-GISEL-NEXT: s_lshl2_add_u32 s0, s1, 15 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 2 ; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0x7ff @@ -1012,7 +1112,7 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: .LBB7_6: ; %bb.2 +; GFX11-GISEL-NEXT: .LBB7_4: ; %bb.2 ; GFX11-GISEL-NEXT: s_endpgm entry: %cond = icmp eq i32 %n, 0 @@ -1034,21 +1134,30 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, s33 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 -; GFX9-SDAG-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s6, v1, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 @@ -1057,95 +1166,143 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) { ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s33, s7 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s8, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 -; GFX9-GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX9-GISEL-NEXT: ; %bb.2: -; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 -; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 -; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s7, v1, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s7, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s4 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s33, s8 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s2, s33 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 -; GFX11-SDAG-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v1, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x7b +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-SDAG-NEXT: scratch_load_b32 v1, off, s33 +; GFX11-SDAG-NEXT: scratch_load_b32 v2, off, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s2 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s3, s33 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s33 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 -; GFX11-GISEL-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB8_1 -; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s2, v1, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s2, 5 +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-GISEL-NEXT: scratch_load_b32 v1, off, s33 +; GFX11-GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s3 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, addrspace(5) store volatile i32 123, ptr addrspace(5) %alloca @@ -1156,138 +1313,198 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s10, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s34 +; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s9, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0 -; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 -; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s7, v1, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10 -; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s7, 6, v0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 10 +; GFX9-SDAG-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s11 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s10 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s9 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s33, s8 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: s_mov_b32 s7, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s8, s34 ; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000 -; GFX9-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s6, v1, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0x1fff ; GFX9-GISEL-NEXT: s_lshl_b32 s4, s6, 6 ; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffffe000 ; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 10 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s32, s34 -; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 -; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s8 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s33, s7 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s2, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f -; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s34 ; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v1, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 10 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 -; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 -; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX11-SDAG-NEXT: ; %bb.2: ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 10 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s3 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-SDAG-NEXT: scratch_load_b32 v1, off, s33 +; GFX11-SDAG-NEXT: scratch_load_b32 v2, off, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s2 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s2, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f -; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s33 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s3, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100 -; GFX11-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB9_1 -; GFX11-GISEL-NEXT: ; %bb.2: -; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0xfff +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s1, v1, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_add_u32 s0, s32, 0xfff ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 10 -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s32, s34 -; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s3 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-GISEL-NEXT: scratch_load_b32 v1, off, s33 +; GFX11-GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s2 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, align 128, addrspace(5) store volatile i32 10, ptr addrspace(5) %alloca @@ -1298,21 +1515,30 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, s33 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 -; GFX9-SDAG-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s6, v1, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 @@ -1321,95 +1547,143 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) { ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s33, s7 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s8, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 -; GFX9-GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX9-GISEL-NEXT: ; %bb.2: -; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 -; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 -; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s7, v1, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s7, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s4 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 22 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s33, s8 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s2, s33 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 -; GFX11-SDAG-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 22 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v1, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 22 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-SDAG-NEXT: scratch_load_b32 v1, off, s33 +; GFX11-SDAG-NEXT: scratch_load_b32 v2, off, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s2 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s3, s33 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s33 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 -; GFX11-GISEL-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s2, v1, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 22 -; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s2, 5 +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-GISEL-NEXT: scratch_load_b32 v1, off, s33 +; GFX11-GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s3 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i32 %n, align 2, addrspace(5) store volatile i32 22, ptr addrspace(5) %alloca @@ -1420,124 +1694,180 @@ define void @test_dynamic_stackalloc_device_divergent() { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, s33 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 -; GFX9-SDAG-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB11_1 -; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s6, v0, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v1 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s33, s7 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s8, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v31 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 -; GFX9-GISEL-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB11_1 -; GFX9-GISEL-NEXT: ; %bb.2: -; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 -; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 -; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s7, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-GISEL-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s33, s8 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s2, s33 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v31 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX11-SDAG-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, v2, 2, 15 +; GFX11-SDAG-NEXT: v_and_b32_e32 v2, 0x1ff0, v2 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB11_1 -; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v0, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x7b +; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s1, 5, s0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, s33 +; GFX11-SDAG-NEXT: scratch_load_b32 v1, off, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s2 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s3, s33 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 -; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s33 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v31 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v2, v2, 2, 15 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: v_and_b32_e32 v2, -16, v2 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB11_1 -; GFX11-GISEL-NEXT: ; %bb.2: -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s2, v0, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s2, 5 +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s0 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-GISEL-NEXT: scratch_load_b32 v0, off, s33 +; GFX11-GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s3 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca i32, i32 %idx, addrspace(5) @@ -1549,142 +1879,202 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s10, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s34 +; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s9, s34 ; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 +; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 ; GFX9-SDAG-NEXT: s_and_b32 s6, s4, 0xffffe000 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0 -; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000 -; GFX9-SDAG-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s8, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s9, v0, s8 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s8 -; GFX9-SDAG-NEXT: s_max_u32 s7, s7, s9 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB12_1 -; GFX9-SDAG-NEXT: ; %bb.2: -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s7, 6, v0 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc -; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v2, s7, 6, v1 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v2 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x1bc +; GFX9-SDAG-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s11 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s10 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s9 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s33, s8 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 +; GFX9-GISEL-NEXT: s_mov_b32 s7, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0 -; GFX9-GISEL-NEXT: s_mov_b32 s10, s34 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s8, s34 ; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000 -; GFX9-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB12_1 -; GFX9-GISEL-NEXT: ; %bb.2: +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s6, v0, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-GISEL-NEXT: s_add_u32 s5, s32, 0x1fff ; GFX9-GISEL-NEXT: s_lshl_b32 s4, s6, 6 ; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffffe000 ; GFX9-GISEL-NEXT: s_add_u32 s32, s5, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x1bc +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-GISEL-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s32, s34 -; GFX9-GISEL-NEXT: s_mov_b32 s34, s10 -; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s8 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s33, s7 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s2, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f -; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo -; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff -; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80 -; GFX11-SDAG-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 +; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, v2, 2, 15 +; GFX11-SDAG-NEXT: v_and_b32_e32 v2, 0x1ff0, v2 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB12_1 -; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc -; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v0, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x1bc +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s1, 5, s0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s3 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, s33 +; GFX11-SDAG-NEXT: scratch_load_b32 v1, off, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s2 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s2, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f -; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s33 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX11-GISEL-NEXT: s_mov_b32 s3, s34 ; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshl_add_u32 v2, v2, 2, 15 +; GFX11-GISEL-NEXT: v_and_b32_e32 v2, -16, v2 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB12_1 -; GFX11-GISEL-NEXT: ; %bb.2: -; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0xfff -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x1bc -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff000 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s1, v0, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_add_u32 s0, s32, 0xfff +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x1bc +; GFX11-GISEL-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s0 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s32, s34 -; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s3 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-GISEL-NEXT: scratch_load_b32 v0, off, s33 +; GFX11-GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s2 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca i32, i32 %idx, align 128, addrspace(5) @@ -1696,124 +2086,180 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, s33 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 -; GFX9-SDAG-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB13_1 -; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s6, v0, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s6, 6, v1 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a +; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s33, s7 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s8, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v31 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 -; GFX9-GISEL-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB13_1 -; GFX9-GISEL-NEXT: ; %bb.2: -; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 -; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 -; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_lshl_add_u32 v1, v1, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s7, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x29a +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-GISEL-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s33, s8 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s2, s33 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s33 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v31 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX11-SDAG-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, v2, 2, 15 +; GFX11-SDAG-NEXT: v_and_b32_e32 v2, 0x1ff0, v2 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB13_1 -; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v0, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x29a +; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s1, 5, s0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, s33 +; GFX11-SDAG-NEXT: scratch_load_b32 v1, off, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s2 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s3, s33 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 -; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s33 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v31 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v2, v2, 2, 15 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB13_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: v_and_b32_e32 v2, -16, v2 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB13_1 -; GFX11-GISEL-NEXT: ; %bb.2: -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a -; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s2, v0, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x29a +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s2, 5 +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s0 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-GISEL-NEXT: scratch_load_b32 v0, off, s33 +; GFX11-GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s3 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = call i32 @llvm.amdgcn.workitem.id.x() %alloca = alloca i32, i32 %idx, align 2, addrspace(5) @@ -1825,74 +2271,94 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s13, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 -; GFX9-SDAG-NEXT: s_mov_b32 s14, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s11, s34 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x3000 ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6 +; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.0 ; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff ; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 +; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0xfffff000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 -; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec -; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 -; GFX9-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s11, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s12, v1, s11 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s11 -; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s12 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 -; GFX9-SDAG-NEXT: ; %bb.3: -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 -; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX9-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1 -; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec -; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 -; GFX9-SDAG-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s11, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s12, v1, s11 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s11 -; GFX9-SDAG-NEXT: s_max_u32 s10, s10, s12 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_4 -; GFX9-SDAG-NEXT: ; %bb.5: +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v1, s[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v2, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v3, s9, 6, v1 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v3 +; GFX9-SDAG-NEXT: v_and_b32_e32 v3, 0x3ff, v31 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v3, v3, 2, 15 +; GFX9-SDAG-NEXT: v_and_b32_e32 v3, 0x1ff0, v3 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v3, s[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v2, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-SDAG-NEXT: s_mov_b32 s6, s32 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s10, 6, v1 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 3 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-SDAG-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v3, s8, 6, v3 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v3 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-SDAG-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: .LBB14_6: ; %bb.1 +; GFX9-SDAG-NEXT: .LBB14_2: ; %bb.1 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s6, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s7, v0, s6 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s6 -; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s7 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB14_7 -; GFX9-SDAG-NEXT: ; %bb.8: +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s33 @@ -1901,255 +2367,339 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s14 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s13 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s11 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s33, s10 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas: ; GFX9-GISEL: ; %bb.0: ; %entry ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s13, s33 +; GFX9-GISEL-NEXT: s_mov_b32 s11, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0 -; GFX9-GISEL-NEXT: s_mov_b32 s14, s34 -; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s12, s34 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x3000 ; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-GISEL-NEXT: s_cbranch_execz .LBB14_6 +; GFX9-GISEL-NEXT: s_cbranch_execz .LBB14_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.0 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0x3ff, v31 ; GFX9-GISEL-NEXT: v_and_b32_e32 v1, -16, v1 -; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9-GISEL-NEXT: s_mov_b32 s9, 0 -; GFX9-GISEL-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX9-GISEL-NEXT: v_readlane_b32 s11, v1, s10 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX9-GISEL-NEXT: s_max_u32 s9, s9, s11 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB14_2 -; GFX9-GISEL-NEXT: ; %bb.3: +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v1, s[6:7] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v2, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-GISEL-NEXT: s_add_u32 s7, s32, 0xfff -; GFX9-GISEL-NEXT: s_lshl_b32 s6, s9, 6 -; GFX9-GISEL-NEXT: s_and_b32 s9, s7, 0xfffff000 -; GFX9-GISEL-NEXT: v_lshl_add_u32 v1, v2, 2, 15 -; GFX9-GISEL-NEXT: s_add_u32 s32, s9, s6 +; GFX9-GISEL-NEXT: s_lshl_b32 s6, s8, 6 +; GFX9-GISEL-NEXT: s_and_b32 s8, s7, 0xfffff000 +; GFX9-GISEL-NEXT: s_add_u32 s32, s8, s6 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v1, v3, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s9, s32 ; GFX9-GISEL-NEXT: v_and_b32_e32 v1, -16, v1 -; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9-GISEL-NEXT: s_mov_b32 s10, 0 -; GFX9-GISEL-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s11, s[6:7] -; GFX9-GISEL-NEXT: v_readlane_b32 s12, v1, s11 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s11 -; GFX9-GISEL-NEXT: s_max_u32 s10, s10, s12 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB14_4 -; GFX9-GISEL-NEXT: ; %bb.5: -; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v1, s[6:7] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s10, v2, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 3 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-GISEL-NEXT: s_lshl_b32 s7, s10, 6 -; GFX9-GISEL-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-GISEL-NEXT: s_lshl_b32 s6, s10, 6 +; GFX9-GISEL-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s7 -; GFX9-GISEL-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-GISEL-NEXT: s_add_u32 s32, s9, s6 +; GFX9-GISEL-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: .LBB14_6: ; %bb.1 +; GFX9-GISEL-NEXT: .LBB14_2: ; %bb.1 ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s6, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s7, v0, s6 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s6 -; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s7 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB14_7 -; GFX9-GISEL-NEXT: ; %bb.8: -; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 -; GFX9-GISEL-NEXT: s_lshl_b32 s5, s8, 6 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s7, v2, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s7, 6 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s4 ; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s32, s34 -; GFX9-GISEL-NEXT: s_mov_b32 s34, s14 -; GFX9-GISEL-NEXT: s_mov_b32 s33, s13 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s12 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s33, s11 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_multiple_allocas: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s7, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s5, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 -; GFX11-SDAG-NEXT: s_mov_b32 s8, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 offset:64 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s33 offset:68 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v4, s33 offset:72 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v5, s33 offset:76 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s6, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xc0 ; GFX11-SDAG-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6 +; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.0 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff -; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo -; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 -; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 -; GFX11-SDAG-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s5, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5 -; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5 -; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_2 -; GFX11-SDAG-NEXT: ; %bb.3: -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v31 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v2, s3, 5, s2 -; GFX11-SDAG-NEXT: s_mov_b32 s4, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s3, 0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v6, 0x3ff, v31 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v2 +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, -16, v1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v6, v6, 2, 15 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v1, s1 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v6 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s1 +; GFX11-SDAG-NEXT: s_add_i32 s3, s32, 0x7ff +; GFX11-SDAG-NEXT: s_and_b32 s3, s3, 0xfffff800 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v4, v2 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v5, v3 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-SDAG-NEXT: v_max_u32_e32 v2, v2, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s2, v2, 31 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v2, v3, v5 +; GFX11-SDAG-NEXT: v_readlane_b32 s4, v2, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s2, 5, s3 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x1ff0, v1 -; GFX11-SDAG-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s5, s4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s6, v1, s5 -; GFX11-SDAG-NEXT: s_bitset0_b32 s4, s5 -; GFX11-SDAG-NEXT: s_max_u32 s3, s3, s6 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_4 -; GFX11-SDAG-NEXT: ; %bb.5: -; GFX11-SDAG-NEXT: s_mov_b32 s4, s32 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s3, 5, s4 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s2 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s4 dlc -; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 -; GFX11-SDAG-NEXT: .LBB14_6: ; %bb.1 -; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v6, s3 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v7, s1 dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v1, s4, 5, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1 +; GFX11-SDAG-NEXT: .LBB14_2: ; %bb.1 +; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-SDAG-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB14_7 -; GFX11-SDAG-NEXT: ; %bb.8: -; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v2, v2, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v2, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, s32 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v6, 2 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s1 dlc +; GFX11-SDAG-NEXT: scratch_store_b32 off, v6, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s7 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s8 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s6 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-SDAG-NEXT: scratch_load_b32 v2, off, s33 offset:64 +; GFX11-SDAG-NEXT: scratch_load_b32 v3, off, s33 offset:68 +; GFX11-SDAG-NEXT: scratch_load_b32 v4, off, s33 offset:72 +; GFX11-SDAG-NEXT: scratch_load_b32 v5, off, s33 offset:76 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s5 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s7, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63 -; GFX11-GISEL-NEXT: s_mov_b32 s8, s34 -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Spill +; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:64 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v3, s33 offset:68 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v4, s33 offset:72 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v5, s33 offset:76 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 +; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xc0 ; GFX11-GISEL-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-GISEL-NEXT: s_cbranch_execz .LBB14_6 +; GFX11-GISEL-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.0 -; GFX11-GISEL-NEXT: v_lshl_add_u32 v2, v1, 2, 15 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v31 -; GFX11-GISEL-NEXT: s_mov_b32 s3, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s2, 0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_and_b32_e32 v2, -16, v2 -; GFX11-GISEL-NEXT: .LBB14_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s4, s3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s5, v2, s4 -; GFX11-GISEL-NEXT: s_bitset0_b32 s3, s4 -; GFX11-GISEL-NEXT: s_max_u32 s2, s2, s5 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s3, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_2 -; GFX11-GISEL-NEXT: ; %bb.3: +; GFX11-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v31 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v1, v1, 2, 15 -; GFX11-GISEL-NEXT: s_lshl_b32 s5, s2, 5 -; GFX11-GISEL-NEXT: s_add_u32 s2, s32, 0x7ff -; GFX11-GISEL-NEXT: s_mov_b32 s4, exec_lo -; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xfffff800 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_lshl_add_u32 v6, v6, 2, 15 ; GFX11-GISEL-NEXT: v_and_b32_e32 v1, -16, v1 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0 -; GFX11-GISEL-NEXT: s_add_u32 s32, s2, s5 -; GFX11-GISEL-NEXT: .LBB14_4: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s5, s4 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s6, v1, s5 -; GFX11-GISEL-NEXT: s_bitset0_b32 s4, s5 -; GFX11-GISEL-NEXT: s_max_u32 s3, s3, s6 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_4 -; GFX11-GISEL-NEXT: ; %bb.5: -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v2, 4 -; GFX11-GISEL-NEXT: s_mov_b32 s4, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s3, s3, 5 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s2 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s4 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_add_u32 s32, s4, s3 -; GFX11-GISEL-NEXT: .LBB14_6: ; %bb.1 -; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB14_7: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB14_7 -; GFX11-GISEL-NEXT: ; %bb.8: -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 -; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s33 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_and_b32_e32 v6, -16, v6 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v1, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, v6, s1 +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_max_u32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v4, v2 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: ds_swizzle_b32 v5, v3 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-GISEL-NEXT: v_max_u32_e32 v2, v2, v4 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v3, v3, v5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-NEXT: v_readlane_b32 s2, v2, 31 +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v3, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0x7ff +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 3 :: v_dual_mov_b32 v6, 4 +; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff800 +; GFX11-GISEL-NEXT: s_lshl_b32 s2, s2, 5 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s2 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 +; GFX11-GISEL-NEXT: s_lshl_b32 s2, s3, 5 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v6, s1 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s2 +; GFX11-GISEL-NEXT: .LBB14_2: ; %bb.1 +; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v2, v2, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s2, v2, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s2, 5 +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s33 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s0 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: s_mov_b32 s32, s34 -; GFX11-GISEL-NEXT: s_mov_b32 s34, s8 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s7 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x3 ; 16-byte Folded Reload +; GFX11-GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:64 +; GFX11-GISEL-NEXT: scratch_load_b32 v3, off, s33 offset:68 +; GFX11-GISEL-NEXT: scratch_load_b32 v4, off, s33 offset:72 +; GFX11-GISEL-NEXT: scratch_load_b32 v5, off, s33 offset:76 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %n, 0 @@ -2173,55 +2723,70 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s12, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s33 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0 -; GFX9-SDAG-NEXT: s_mov_b32 s13, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 -; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s11, s34 +; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x2000 ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_2 ; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1 ; GFX9-SDAG-NEXT: s_add_i32 s6, s32, 0xfff ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 -; GFX9-SDAG-NEXT: s_and_b32 s9, s6, 0xfffff000 +; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0xfffff000 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec -; GFX9-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s10, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s11, v0, s10 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s10 -; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s11 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 -; GFX9-SDAG-NEXT: ; %bb.3: -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s9 -; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s8, 6, v0 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s9, v2, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-SDAG-NEXT: v_lshl_add_u32 v1, s9, 6, v0 ; GFX9-SDAG-NEXT: v_readfirstlane_b32 s32, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: ; implicit-def: $vgpr31 -; GFX9-SDAG-NEXT: .LBB15_4: ; %Flow +; GFX9-SDAG-NEXT: .LBB15_2: ; %Flow ; GFX9-SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_8 -; GFX9-SDAG-NEXT: ; %bb.5: ; %bb.0 +; GFX9-SDAG-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-SDAG-NEXT: ; %bb.3: ; %bb.0 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], exec -; GFX9-SDAG-NEXT: s_mov_b32 s8, 0 -; GFX9-SDAG-NEXT: .LBB15_6: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-SDAG-NEXT: v_readlane_b32 s10, v0, s9 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-SDAG-NEXT: s_max_u32 s8, s8, s10 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB15_6 -; GFX9-SDAG-NEXT: ; %bb.7: +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s8, v2, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-SDAG-NEXT: s_mov_b32 s6, s32 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s8, 6, v0 @@ -2229,39 +2794,52 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2 +; GFX9-SDAG-NEXT: .LBB15_4: ; %bb.2 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX9-SDAG-NEXT: s_mov_b32 s34, s13 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s12 +; GFX9-SDAG-NEXT: s_mov_b32 s34, s11 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s33, s10 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX9-GISEL: ; %bb.0: ; %entry ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: s_mov_b32 s11, s33 +; GFX9-GISEL-NEXT: s_mov_b32 s10, s33 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0 -; GFX9-GISEL-NEXT: s_mov_b32 s12, s34 -; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 -; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s11, s34 +; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x2000 ; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-GISEL-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-GISEL-NEXT: s_cbranch_execz .LBB15_2 ; GFX9-GISEL-NEXT: ; %bb.1: ; %bb.1 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v1, 2, 15 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9-GISEL-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-GISEL-NEXT: v_readlane_b32 s10, v0, s9 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s10 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB15_2 -; GFX9-GISEL-NEXT: ; %bb.3: +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[6:7] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s8, v2, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-GISEL-NEXT: s_add_u32 s7, s32, 0xfff ; GFX9-GISEL-NEXT: s_and_b32 s7, s7, 0xfffff000 ; GFX9-GISEL-NEXT: s_lshl_b32 s6, s8, 6 @@ -2271,174 +2849,222 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) { ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: ; implicit-def: $vgpr31 -; GFX9-GISEL-NEXT: .LBB15_4: ; %Flow +; GFX9-GISEL-NEXT: .LBB15_2: ; %Flow ; GFX9-GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-GISEL-NEXT: s_cbranch_execz .LBB15_8 -; GFX9-GISEL-NEXT: ; %bb.5: ; %bb.0 +; GFX9-GISEL-NEXT: s_cbranch_execz .LBB15_4 +; GFX9-GISEL-NEXT: ; %bb.3: ; %bb.0 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s8, s32 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9-GISEL-NEXT: s_mov_b32 s8, 0 -; GFX9-GISEL-NEXT: .LBB15_6: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s9, s[6:7] -; GFX9-GISEL-NEXT: v_readlane_b32 s10, v0, s9 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[6:7], s9 -; GFX9-GISEL-NEXT: s_max_u32 s8, s8, s10 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB15_6 -; GFX9-GISEL-NEXT: ; %bb.7: -; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 -; GFX9-GISEL-NEXT: s_lshl_b32 s7, s8, 6 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[6:7] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s9, v2, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-GISEL-NEXT: s_lshl_b32 s6, s9, 6 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s7 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-GISEL-NEXT: s_add_u32 s32, s8, s6 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: .LBB15_8: ; %bb.2 +; GFX9-GISEL-NEXT: .LBB15_4: ; %bb.2 ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-GISEL-NEXT: s_mov_b32 s32, s34 -; GFX9-GISEL-NEXT: s_mov_b32 s34, s12 -; GFX9-GISEL-NEXT: s_mov_b32 s33, s11 +; GFX9-GISEL-NEXT: s_mov_b32 s34, s11 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s33, s10 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s6, s33 +; GFX11-SDAG-NEXT: s_mov_b32 s3, s33 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63 -; GFX11-SDAG-NEXT: s_mov_b32 s7, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 -; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s4, s34 +; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s34, s32 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x80 ; GFX11-SDAG-NEXT: v_cmpx_ne_u32_e32 0, v0 ; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_4 +; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_2 ; GFX11-SDAG-NEXT: ; %bb.1: ; %bb.1 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 -; GFX11-SDAG-NEXT: s_add_i32 s2, s32, 0x7ff -; GFX11-SDAG-NEXT: s_mov_b32 s3, exec_lo -; GFX11-SDAG-NEXT: s_and_b32 s2, s2, 0xfffff800 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-SDAG-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s4, s3 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s5, v0, s4 -; GFX11-SDAG-NEXT: s_bitset0_b32 s3, s4 -; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s5 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s3, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_2 -; GFX11-SDAG-NEXT: ; %bb.3: -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v0, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v2, v2, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s2, v2, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-SDAG-NEXT: s_add_i32 s1, s32, 0x7ff ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-SDAG-NEXT: s_and_b32 s1, s1, 0xfffff800 ; GFX11-SDAG-NEXT: ; implicit-def: $vgpr31 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s2, 5, s1 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: .LBB15_4: ; %Flow +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 +; GFX11-SDAG-NEXT: .LBB15_2: ; %Flow ; GFX11-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_8 -; GFX11-SDAG-NEXT: ; %bb.5: ; %bb.0 +; GFX11-SDAG-NEXT: s_cbranch_execz .LBB15_4 +; GFX11-SDAG-NEXT: ; %bb.3: ; %bb.0 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s1, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0 -; GFX11-SDAG-NEXT: .LBB15_6: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s4, v0, s3 -; GFX11-SDAG-NEXT: s_bitset0_b32 s2, s3 -; GFX11-SDAG-NEXT: s_max_u32 s1, s1, s4 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB15_6 -; GFX11-SDAG-NEXT: ; %bb.7: -; GFX11-SDAG-NEXT: s_mov_b32 s2, s32 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v0, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v2, v2, v3 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s2, v2, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s2 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s2 dlc +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s2, 5, s1 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 -; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2 +; GFX11-SDAG-NEXT: .LBB15_4: ; %bb.2 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s34 -; GFX11-SDAG-NEXT: s_mov_b32 s34, s7 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s6 +; GFX11-SDAG-NEXT: s_mov_b32 s34, s4 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-SDAG-NEXT: scratch_load_b32 v2, off, s33 +; GFX11-SDAG-NEXT: scratch_load_b32 v3, off, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s3 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_control_flow: ; GFX11-GISEL: ; %bb.0: ; %entry ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_mov_b32 s5, s33 +; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63 -; GFX11-GISEL-NEXT: s_mov_b32 s6, s34 -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0 -; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s33 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v3, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s5, s34 +; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX11-GISEL-NEXT: s_mov_b32 s34, s32 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x80 ; GFX11-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v0 ; GFX11-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-GISEL-NEXT: s_cbranch_execz .LBB15_4 +; GFX11-GISEL-NEXT: s_cbranch_execz .LBB15_2 ; GFX11-GISEL-NEXT: ; %bb.1: ; %bb.1 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v1, 2, 15 -; GFX11-GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB15_2: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3 -; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_2 -; GFX11-GISEL-NEXT: ; %bb.3: +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v2, v2, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s2, v2, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 2 -; GFX11-GISEL-NEXT: s_add_u32 s2, s32, 0x7ff -; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5 -; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xfffff800 +; GFX11-GISEL-NEXT: s_add_u32 s1, s32, 0x7ff +; GFX11-GISEL-NEXT: s_lshl_b32 s2, s2, 5 +; GFX11-GISEL-NEXT: s_and_b32 s1, s1, 0xfffff800 ; GFX11-GISEL-NEXT: ; implicit-def: $vgpr31 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s2, s1 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s2 dlc +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s2 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: .LBB15_4: ; %Flow +; GFX11-GISEL-NEXT: .LBB15_2: ; %Flow ; GFX11-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-GISEL-NEXT: s_cbranch_execz .LBB15_8 -; GFX11-GISEL-NEXT: ; %bb.5: ; %bb.0 +; GFX11-GISEL-NEXT: s_cbranch_execz .LBB15_4 +; GFX11-GISEL-NEXT: ; %bb.3: ; %bb.0 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s1, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB15_6: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s3, s2 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s4, v0, s3 -; GFX11-GISEL-NEXT: s_bitset0_b32 s2, s3 -; GFX11-GISEL-NEXT: s_max_u32 s1, s1, s4 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB15_6 -; GFX11-GISEL-NEXT: ; %bb.7: +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, v0, s2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v3, v2 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v2, v2, v3 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s3, v2, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 1 -; GFX11-GISEL-NEXT: s_mov_b32 s2, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s1, s1, 5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s2, s1 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s2 dlc +; GFX11-GISEL-NEXT: s_lshl_b32 s2, s3, 5 +; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s2 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-GISEL-NEXT: .LBB15_8: ; %bb.2 +; GFX11-GISEL-NEXT: .LBB15_4: ; %bb.2 ; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-GISEL-NEXT: s_mov_b32 s32, s34 -; GFX11-GISEL-NEXT: s_mov_b32 s34, s6 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s5 +; GFX11-GISEL-NEXT: s_mov_b32 s34, s5 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-GISEL-NEXT: scratch_load_b32 v2, off, s33 +; GFX11-GISEL-NEXT: scratch_load_b32 v3, off, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %n, 0 @@ -2460,22 +3086,31 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s7, s33 +; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 -; GFX9-SDAG-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s6, v1, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 @@ -2484,101 +3119,148 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s33, s7 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s8, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 -; GFX9-GISEL-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX9-GISEL-NEXT: ; %bb.2: -; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 -; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 -; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s7, v1, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s7, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s4 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s33, s8 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s2, s33 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v3, 2, 15 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0 -; GFX11-SDAG-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v1, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x29a +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-SDAG-NEXT: scratch_load_b32 v1, off, s33 +; GFX11-SDAG-NEXT: scratch_load_b32 v2, off, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s2 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s3, s33 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 -; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s33 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX11-GISEL-NEXT: .LBB16_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB16_1 -; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s2, v1, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a -; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s2, 5 +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-GISEL-NEXT: scratch_load_b32 v1, off, s33 +; GFX11-GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s3 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i16 %n, align 2, addrspace(5) store volatile i32 666, ptr addrspace(5) %alloca @@ -2589,21 +3271,30 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX9-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64: ; GFX9-SDAG: ; %bb.0: ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s33 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec -; GFX9-SDAG-NEXT: s_mov_b32 s6, 0 +; GFX9-SDAG-NEXT: s_mov_b32 s7, s33 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s32 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x400 -; GFX9-SDAG-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-SDAG-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-SDAG-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-SDAG-NEXT: s_max_u32 s6, s6, s8 -; GFX9-SDAG-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-SDAG-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX9-SDAG-NEXT: ; %bb.2: +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-SDAG-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-SDAG-NEXT: v_readlane_b32 s6, v1, 63 +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-SDAG-NEXT: s_mov_b32 s4, s32 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s6, 6, v0 @@ -2612,95 +3303,143 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX9-SDAG-NEXT: s_mov_b32 s33, s9 +; GFX9-SDAG-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-SDAG-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-SDAG-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-SDAG-NEXT: s_mov_b32 s33, s7 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX9-GISEL-NEXT: s_mov_b32 s9, s33 -; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec -; GFX9-GISEL-NEXT: s_mov_b32 s6, 0 +; GFX9-GISEL-NEXT: s_mov_b32 s8, s33 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s32 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x400 -; GFX9-GISEL-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5] -; GFX9-GISEL-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-GISEL-NEXT: s_bitset0_b64 s[4:5], s7 -; GFX9-GISEL-NEXT: s_max_u32 s6, s6, s8 -; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-GISEL-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX9-GISEL-NEXT: ; %bb.2: -; GFX9-GISEL-NEXT: s_mov_b32 s4, s32 -; GFX9-GISEL-NEXT: s_lshl_b32 s5, s6, 6 -; GFX9-GISEL-NEXT: s_add_u32 s32, s4, s5 +; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX9-GISEL-NEXT: s_mov_b32 s6, s32 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX9-GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5] +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: s_nop 1 +; GFX9-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9-GISEL-NEXT: v_readlane_b32 s7, v1, 63 +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s7, 6 +; GFX9-GISEL-NEXT: s_add_u32 s32, s6, s4 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-GISEL-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX9-GISEL-NEXT: s_mov_b32 s33, s9 +; GFX9-GISEL-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-GISEL-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-GISEL-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-GISEL-NEXT: s_mov_b32 s33, s8 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo -; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 +; GFX11-SDAG-NEXT: s_mov_b32 s2, s33 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s33 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v2, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 -; GFX11-SDAG-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-SDAG-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2 -; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3 -; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX11-SDAG-NEXT: ; %bb.2: -; GFX11-SDAG-NEXT: s_mov_b32 s1, s32 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1 -; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-SDAG-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readlane_b32 s1, v1, 31 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s0, s32 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 0x29a +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s1, 5, s0 +; GFX11-SDAG-NEXT: scratch_store_b32 off, v3, s0 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0 ; GFX11-SDAG-NEXT: s_mov_b32 s32, s33 -; GFX11-SDAG-NEXT: s_mov_b32 s33, s4 +; GFX11-SDAG-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-SDAG-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-SDAG-NEXT: scratch_load_b32 v1, off, s33 +; GFX11-SDAG-NEXT: scratch_load_b32 v2, off, s33 offset:4 +; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-SDAG-NEXT: s_mov_b32 s33, s2 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64: ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 -; GFX11-GISEL-NEXT: s_mov_b32 s4, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo -; GFX11-GISEL-NEXT: s_mov_b32 s0, 0 +; GFX11-GISEL-NEXT: s_mov_b32 s3, s33 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s32 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s33 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15 ; GFX11-GISEL-NEXT: s_add_i32 s32, s32, 16 -; GFX11-GISEL-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1 -; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: s_mov_b32 s0, s32 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX11-GISEL-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2 -; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3 -; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB17_1 -; GFX11-GISEL-NEXT: ; %bb.2: +; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s1 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX11-GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: v_readlane_b32 s2, v1, 31 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a -; GFX11-GISEL-NEXT: s_mov_b32 s1, s32 -; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0 -; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc -; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_lshl_b32 s1, s2, 5 +; GFX11-GISEL-NEXT: s_add_u32 s32, s0, s1 ; GFX11-GISEL-NEXT: s_mov_b32 s32, s33 -; GFX11-GISEL-NEXT: s_mov_b32 s33, s4 +; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s0 dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GFX11-GISEL-NEXT: scratch_load_b32 v1, off, s33 +; GFX11-GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:4 +; GFX11-GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-GISEL-NEXT: s_mov_b32 s33, s3 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, i64 %n, align 2, addrspace(5) store volatile i32 666, ptr addrspace(5) %alloca diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll index e58bf6280a1f..d7fa9bc80063 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll @@ -463,6 +463,911 @@ entry: ret void } +define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_clause 0x1 +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_clause 0x1 +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_clause 0x1 +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_clause 0x1 +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: uniform_value_dpp: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %in, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: divergent_value_dpp: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX12DAGISEL-NEXT: s_wait_dscnt 0x0 +; GFX12DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: default_stratergy: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: default_stratergy: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: default_stratergy: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: default_stratergy: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: default_stratergy: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: default_stratergy: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: default_stratergy: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: default_stratergy: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: default_stratergy: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: default_stratergy: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: default_stratergy: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: default_stratergy: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: default_stratergy: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX12DAGISEL-NEXT: s_wait_dscnt 0x0 +; GFX12DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.add.i32(i32 %id.x, i32 0) + store i32 %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -470,7 +1375,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else ; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -478,24 +1383,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX8DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX8DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX8DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_add_i32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -510,7 +1415,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -518,20 +1423,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX8GISEL-NEXT: .LBB2_2: ; %Flow +; GFX8GISEL-NEXT: .LBB5_2: ; %Flow ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, 0 -; GFX8GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_add_i32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX8GISEL-NEXT: .LBB2_5: ; %endif +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX8GISEL-NEXT: .LBB5_5: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -547,7 +1452,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else ; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -555,24 +1460,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX9DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX9DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX9DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_add_i32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -586,7 +1491,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -594,20 +1499,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: .LBB5_2: ; %Flow ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, 0 -; GFX9GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_add_i32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX9GISEL-NEXT: .LBB2_5: ; %endif +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX9GISEL-NEXT: .LBB5_5: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -622,7 +1527,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -630,24 +1535,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX1064DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_add_i32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -661,7 +1566,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -669,20 +1574,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_add_i32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1064GISEL-NEXT: .LBB2_5: ; %endif +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1064GISEL-NEXT: .LBB5_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -697,7 +1602,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -705,24 +1610,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX1032DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1032DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_add_i32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -736,7 +1641,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo @@ -744,20 +1649,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2 -; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1032GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_add_i32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1032GISEL-NEXT: .LBB2_5: ; %endif +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1032GISEL-NEXT: .LBB5_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -774,7 +1679,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -783,25 +1688,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX1164DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_add_i32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -817,7 +1722,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -826,21 +1731,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_add_i32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1164GISEL-NEXT: .LBB2_5: ; %endif +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1164GISEL-NEXT: .LBB5_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -857,7 +1762,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -866,25 +1771,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX1132DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_add_i32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -900,7 +1805,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo @@ -909,21 +1814,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2 -; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1132GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_add_i32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1132GISEL-NEXT: .LBB2_5: ; %endif +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1132GISEL-NEXT: .LBB5_5: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 @@ -939,7 +1844,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX12DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX12DAGISEL-NEXT: ; %bb.1: ; %else ; GFX12DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -948,15 +1853,15 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s2, s2 ; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX12DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX12DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX12DAGISEL-NEXT: ; %bb.3: ; %if ; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX12DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX12DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 @@ -964,10 +1869,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX12DAGISEL-NEXT: s_add_co_i32 s1, s1, s6 ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX12DAGISEL-NEXT: ; %bb.5: ; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX12DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX12DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -1215,7 +2120,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1223,7 +2128,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8 ; GFX8DAGISEL-NEXT: s_addc_u32 s5, s5, s10 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1236,7 +2141,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX8GISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1244,7 +2149,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s8 ; GFX8GISEL-NEXT: s_addc_u32 s5, s5, s10 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1257,7 +2162,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1265,7 +2170,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8 ; GFX9DAGISEL-NEXT: s_addc_u32 s5, s5, s10 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1278,7 +2183,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX9GISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1286,7 +2191,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s8 ; GFX9GISEL-NEXT: s_addc_u32 s5, s5, s10 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1299,7 +2204,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1307,7 +2212,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_add_u32 s4, s4, s9 ; GFX1064DAGISEL-NEXT: s_addc_u32 s5, s5, s10 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1319,7 +2224,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX1064GISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1327,7 +2232,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_add_u32 s4, s4, s9 ; GFX1064GISEL-NEXT: s_addc_u32 s5, s5, s10 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1339,7 +2244,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7 @@ -1347,7 +2252,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_add_u32 s4, s4, s8 ; GFX1032DAGISEL-NEXT: s_addc_u32 s5, s5, s9 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1359,7 +2264,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7 ; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7 @@ -1367,7 +2272,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_add_u32 s4, s4, s8 ; GFX1032GISEL-NEXT: s_addc_u32 s5, s5, s9 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1379,7 +2284,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s4, s[2:3] ; GFX1164DAGISEL-NEXT: v_readlane_b32 s5, v2, s4 @@ -1388,7 +2293,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_add_u32 s0, s0, s5 ; GFX1164DAGISEL-NEXT: s_addc_u32 s1, s1, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1400,7 +2305,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s4, s[2:3] ; GFX1164GISEL-NEXT: v_readlane_b32 s5, v2, s4 @@ -1409,7 +2314,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_add_u32 s0, s0, s5 ; GFX1164GISEL-NEXT: s_addc_u32 s1, s1, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1421,7 +2326,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 @@ -1430,7 +2335,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_add_u32 s0, s0, s4 ; GFX1132DAGISEL-NEXT: s_addc_u32 s1, s1, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1441,7 +2346,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3 @@ -1450,7 +2355,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_add_u32 s0, s0, s4 ; GFX1132GISEL-NEXT: s_addc_u32 s1, s1, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1465,7 +2370,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX12DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX12DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -1475,7 +2380,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX12DAGISEL-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12DAGISEL-NEXT: ; %bb.2: ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 @@ -1496,7 +2401,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7] @@ -1505,7 +2410,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s7 ; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, s7 ; GFX8DAGISEL-NEXT: s_add_u32 s7, s2, s3 -; GFX8DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -1534,7 +2439,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7] @@ -1543,10 +2448,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: s_mul_hi_u32 s2, s2, s7 ; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s7 ; GFX8GISEL-NEXT: s_add_u32 s7, s2, s3 -; GFX8GISEL-NEXT: .LBB5_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec @@ -1556,7 +2461,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: s_mul_hi_u32 s4, s4, s7 ; GFX8GISEL-NEXT: s_mul_i32 s5, s5, s7 ; GFX8GISEL-NEXT: s_add_u32 s7, s4, s5 -; GFX8GISEL-NEXT: .LBB5_4: ; %endif +; GFX8GISEL-NEXT: .LBB8_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1573,7 +2478,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s5, s[4:5] @@ -1582,7 +2487,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9DAGISEL-NEXT: s_mul_hi_u32 s2, s2, s5 ; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5 ; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s3 -; GFX9DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -1610,7 +2515,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s7, s[6:7] @@ -1619,10 +2524,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: s_mul_hi_u32 s2, s2, s7 ; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s7 ; GFX9GISEL-NEXT: s_add_u32 s7, s2, s3 -; GFX9GISEL-NEXT: .LBB5_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -1632,7 +2537,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: s_mul_hi_u32 s5, s8, s4 ; GFX9GISEL-NEXT: s_mul_i32 s4, s9, s4 ; GFX9GISEL-NEXT: s_add_u32 s7, s5, s4 -; GFX9GISEL-NEXT: .LBB5_4: ; %endif +; GFX9GISEL-NEXT: .LBB8_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1649,7 +2554,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1064DAGISEL-NEXT: s_mov_b64 s[8:9], exec ; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9] @@ -1658,7 +2563,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064DAGISEL-NEXT: s_mul_i32 s3, s3, s8 ; GFX1064DAGISEL-NEXT: s_mul_i32 s8, s2, s8 ; GFX1064DAGISEL-NEXT: s_add_u32 s9, s9, s3 -; GFX1064DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[4:5] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s8 @@ -1686,7 +2591,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1695,10 +2600,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: s_mul_i32 s3, s3, s6 ; GFX1064GISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX1064GISEL-NEXT: s_add_u32 s7, s7, s3 -; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -1708,7 +2613,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: s_mul_i32 s7, s7, s4 ; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4 ; GFX1064GISEL-NEXT: s_add_u32 s7, s5, s7 -; GFX1064GISEL-NEXT: .LBB5_4: ; %endif +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1725,7 +2630,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1734,7 +2639,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032DAGISEL-NEXT: s_mul_i32 s3, s3, s4 ; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, s4 ; GFX1032DAGISEL-NEXT: s_add_u32 s5, s5, s3 -; GFX1032DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -1762,7 +2667,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo ; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6 @@ -1771,10 +2676,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: s_mul_i32 s3, s3, s6 ; GFX1032GISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX1032GISEL-NEXT: s_add_u32 s7, s7, s3 -; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo @@ -1784,7 +2689,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: s_mul_i32 s5, s7, s3 ; GFX1032GISEL-NEXT: s_mul_i32 s6, s6, s3 ; GFX1032GISEL-NEXT: s_add_u32 s7, s4, s5 -; GFX1032GISEL-NEXT: .LBB5_4: ; %endif +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1803,7 +2708,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1813,7 +2718,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164DAGISEL-NEXT: s_mul_i32 s3, s3, s8 ; GFX1164DAGISEL-NEXT: s_mul_i32 s8, s2, s8 ; GFX1164DAGISEL-NEXT: s_add_u32 s9, s9, s3 -; GFX1164DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s8 @@ -1845,7 +2750,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1855,10 +2760,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_mul_i32 s3, s3, s6 ; GFX1164GISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX1164GISEL-NEXT: s_add_u32 s7, s7, s3 -; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec @@ -1869,7 +2774,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_mul_i32 s5, s5, s6 ; GFX1164GISEL-NEXT: s_mul_i32 s6, s4, s6 ; GFX1164GISEL-NEXT: s_add_u32 s7, s7, s5 -; GFX1164GISEL-NEXT: .LBB5_4: ; %endif +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1888,7 +2793,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1898,7 +2803,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132DAGISEL-NEXT: s_mul_i32 s3, s3, s6 ; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX1132DAGISEL-NEXT: s_add_u32 s7, s7, s3 -; GFX1132DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1928,7 +2833,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1938,10 +2843,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_mul_i32 s3, s3, s6 ; GFX1132GISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX1132GISEL-NEXT: s_add_u32 s7, s7, s3 -; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo @@ -1952,7 +2857,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_mul_i32 s5, s5, s3 ; GFX1132GISEL-NEXT: s_mul_i32 s6, s4, s3 ; GFX1132GISEL-NEXT: s_add_u32 s7, s7, s5 -; GFX1132GISEL-NEXT: .LBB5_4: ; %endif +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -1970,7 +2875,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX12DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX12DAGISEL-NEXT: ; %bb.1: ; %else ; GFX12DAGISEL-NEXT: s_mov_b32 s6, exec_lo ; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1980,7 +2885,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX12DAGISEL-NEXT: s_mul_i32 s3, s3, s6 ; GFX12DAGISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX12DAGISEL-NEXT: s_add_co_u32 s7, s7, s3 -; GFX12DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll index f39dd867f958..a767a6ab7282 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll @@ -367,6 +367,784 @@ entry: ret void } +define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value_dpp: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_dpp: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %in, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1032DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1032GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %id.x, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: default_stratergy: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: default_stratergy: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: default_stratergy: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: default_stratergy: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: default_stratergy: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: default_stratergy: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: default_stratergy: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1032DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: default_stratergy: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1032GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: default_stratergy: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: default_stratergy: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: default_stratergy: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: default_stratergy: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.and.i32(i32 %id.x, i32 0) + store i32 %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -382,20 +1160,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX8DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_and_b32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -410,26 +1188,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s6, s2 -; GFX8GISEL-NEXT: .LBB2_2: ; %Flow +; GFX8GISEL-NEXT: .LBB5_2: ; %Flow ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, -1 -; GFX8GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_and_b32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX8GISEL-NEXT: .LBB2_5: ; %endif +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX8GISEL-NEXT: .LBB5_5: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -453,20 +1231,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX9DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_and_b32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -480,26 +1258,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s6, s2 -; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: .LBB5_2: ; %Flow ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_and_b32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX9GISEL-NEXT: .LBB2_5: ; %endif +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX9GISEL-NEXT: .LBB5_5: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -522,20 +1300,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX1064DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_and_b32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -549,26 +1327,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, -1 -; GFX1064GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_and_b32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1064GISEL-NEXT: .LBB2_5: ; %endif +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1064GISEL-NEXT: .LBB5_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -591,20 +1369,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1 -; GFX1032DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_and_b32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -618,26 +1396,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 -; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s0, -1 -; GFX1032GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_and_b32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1032GISEL-NEXT: .LBB2_5: ; %endif +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1032GISEL-NEXT: .LBB5_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -662,21 +1440,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX1164DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_and_b32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -692,27 +1470,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, -1 -; GFX1164GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_and_b32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1164GISEL-NEXT: .LBB2_5: ; %endif +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1164GISEL-NEXT: .LBB5_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -737,21 +1515,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1 -; GFX1132DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_and_b32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -767,27 +1545,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 -; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 -; GFX1132GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_and_b32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1132GISEL-NEXT: .LBB2_5: ; %endif +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1132GISEL-NEXT: .LBB5_5: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 @@ -925,14 +1703,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX8DAGISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -945,14 +1723,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX8GISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX8GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -965,14 +1743,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX9DAGISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -985,14 +1763,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX9GISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX9GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1005,14 +1783,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX1064DAGISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1024,14 +1802,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX1064GISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX1064GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1043,14 +1821,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032DAGISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1062,14 +1840,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7 ; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7 ; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032GISEL-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1081,7 +1859,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s6, s[2:3] ; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s6 @@ -1089,7 +1867,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s6 ; GFX1164DAGISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1101,7 +1879,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s6, s[2:3] ; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s6 @@ -1109,7 +1887,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s6 ; GFX1164GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1121,7 +1899,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 @@ -1129,7 +1907,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1140,7 +1918,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3 @@ -1148,7 +1926,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1189,19 +1967,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX8GISEL-NEXT: .LBB5_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX8GISEL-NEXT: .LBB5_4: ; %endif +; GFX8GISEL-NEXT: .LBB8_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1238,19 +2016,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9GISEL-NEXT: .LBB5_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX9GISEL-NEXT: .LBB5_4: ; %endif +; GFX9GISEL-NEXT: .LBB8_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1287,19 +2065,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX1064GISEL-NEXT: .LBB5_4: ; %endif +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1336,19 +2114,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX1032GISEL-NEXT: .LBB5_4: ; %endif +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1389,19 +2167,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX1164GISEL-NEXT: .LBB5_4: ; %endif +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1440,19 +2218,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX1132GISEL-NEXT: .LBB5_4: ; %endif +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll index 6f299ab8bb9c..502d58f66bd3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll @@ -367,6 +367,792 @@ entry: ret void } +define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value_dpp: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_dpp: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %in, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 +; GFX1032DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 +; GFX1032GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %id.x, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: default_stratergy: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: default_stratergy: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: default_stratergy: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: default_stratergy: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: default_stratergy: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: default_stratergy: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: default_stratergy: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 +; GFX1032DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: default_stratergy: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 +; GFX1032GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: default_stratergy: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: default_stratergy: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: default_stratergy: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: default_stratergy: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0x80000000, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_max_i32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.max.i32(i32 %id.x, i32 0) + store i32 %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -382,20 +1168,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_brev_b32 s6, 1 -; GFX8DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -410,26 +1196,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s6, s2 -; GFX8GISEL-NEXT: .LBB2_2: ; %Flow +; GFX8GISEL-NEXT: .LBB5_2: ; %Flow ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_brev_b32 s6, 1 -; GFX8GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX8GISEL-NEXT: .LBB2_5: ; %endif +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX8GISEL-NEXT: .LBB5_5: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -453,20 +1239,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_brev_b32 s6, 1 -; GFX9DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -480,26 +1266,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s6, s2 -; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: .LBB5_2: ; %Flow ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_brev_b32 s6, 1 -; GFX9GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX9GISEL-NEXT: .LBB2_5: ; %endif +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX9GISEL-NEXT: .LBB5_5: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -522,20 +1308,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_brev_b32 s6, 1 -; GFX1064DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -549,26 +1335,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_brev_b32 s6, 1 -; GFX1064GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1064GISEL-NEXT: .LBB2_5: ; %endif +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1064GISEL-NEXT: .LBB5_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -591,20 +1377,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_brev_b32 s1, 1 -; GFX1032DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_max_i32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -618,26 +1404,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 -; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_brev_b32 s0, 1 -; GFX1032GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_max_i32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1032GISEL-NEXT: .LBB2_5: ; %endif +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1032GISEL-NEXT: .LBB5_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -662,21 +1448,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_brev_b32 s6, 1 -; GFX1164DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -692,27 +1478,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_brev_b32 s6, 1 -; GFX1164GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_max_i32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1164GISEL-NEXT: .LBB2_5: ; %endif +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1164GISEL-NEXT: .LBB5_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -737,21 +1523,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1 -; GFX1132DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_max_i32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -767,27 +1553,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 -; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_brev_b32 s0, 1 -; GFX1132GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_max_i32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1132GISEL-NEXT: .LBB2_5: ; %endif +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1132GISEL-NEXT: .LBB5_5: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 @@ -926,7 +1712,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX8DAGISEL-NEXT: s_brev_b32 s5, 1 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -937,7 +1723,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -951,7 +1737,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_mov_b32 s4, 0 ; GFX8GISEL-NEXT: s_brev_b32 s5, 1 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -962,7 +1748,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -976,7 +1762,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX9DAGISEL-NEXT: s_brev_b32 s5, 1 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -987,7 +1773,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1001,7 +1787,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_mov_b32 s4, 0 ; GFX9GISEL-NEXT: s_brev_b32 s5, 1 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1012,7 +1798,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1026,7 +1812,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX1064DAGISEL-NEXT: s_brev_b32 s5, 1 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1037,7 +1823,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1050,7 +1836,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1064GISEL-NEXT: s_brev_b32 s5, 1 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1061,7 +1847,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1074,7 +1860,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, 0 ; GFX1032DAGISEL-NEXT: s_brev_b32 s5, 1 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1085,7 +1871,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1098,7 +1884,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1032GISEL-NEXT: s_brev_b32 s5, 1 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1109,7 +1895,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1122,7 +1908,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_mov_b32 s0, 0 ; GFX1164DAGISEL-NEXT: s_brev_b32 s1, 1 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1134,7 +1920,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1147,7 +1933,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_mov_b32 s0, 0 ; GFX1164GISEL-NEXT: s_brev_b32 s1, 1 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1159,7 +1945,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1172,7 +1958,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0 ; GFX1132DAGISEL-NEXT: s_brev_b32 s1, 1 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -1183,7 +1969,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1195,7 +1981,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 ; GFX1132GISEL-NEXT: s_brev_b32 s1, 1 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -1206,7 +1992,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1247,19 +2033,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX8GISEL-NEXT: .LBB5_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX8GISEL-NEXT: .LBB5_4: ; %endif +; GFX8GISEL-NEXT: .LBB8_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1296,19 +2082,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9GISEL-NEXT: .LBB5_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX9GISEL-NEXT: .LBB5_4: ; %endif +; GFX9GISEL-NEXT: .LBB8_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1345,19 +2131,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX1064GISEL-NEXT: .LBB5_4: ; %endif +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1394,19 +2180,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX1032GISEL-NEXT: .LBB5_4: ; %endif +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1447,19 +2233,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX1164GISEL-NEXT: .LBB5_4: ; %endif +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1498,19 +2284,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX1132GISEL-NEXT: .LBB5_4: ; %endif +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll index 3c4cbc74aedc..a196fd6388be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll @@ -367,6 +367,792 @@ entry: ret void } +define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value_dpp: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_dpp: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %in, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 +; GFX1032DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 +; GFX1032GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %id.x, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: default_stratergy: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: default_stratergy: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: default_stratergy: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: default_stratergy: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_bfrev_b32_e32 v1, -2 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: default_stratergy: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: default_stratergy: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: default_stratergy: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 +; GFX1032DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: default_stratergy: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 +; GFX1032GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: default_stratergy: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: default_stratergy: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: default_stratergy: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: default_stratergy: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0x7fffffff, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_min_i32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.min.i32(i32 %id.x, i32 0) + store i32 %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -382,20 +1168,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_brev_b32 s6, -2 -; GFX8DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -410,26 +1196,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s6, s2 -; GFX8GISEL-NEXT: .LBB2_2: ; %Flow +; GFX8GISEL-NEXT: .LBB5_2: ; %Flow ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_brev_b32 s6, -2 -; GFX8GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX8GISEL-NEXT: .LBB2_5: ; %endif +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX8GISEL-NEXT: .LBB5_5: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -453,20 +1239,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_brev_b32 s6, -2 -; GFX9DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -480,26 +1266,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s6, s2 -; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: .LBB5_2: ; %Flow ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_brev_b32 s6, -2 -; GFX9GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX9GISEL-NEXT: .LBB2_5: ; %endif +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX9GISEL-NEXT: .LBB5_5: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -522,20 +1308,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_brev_b32 s6, -2 -; GFX1064DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -549,26 +1335,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_brev_b32 s6, -2 -; GFX1064GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1064GISEL-NEXT: .LBB2_5: ; %endif +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1064GISEL-NEXT: .LBB5_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -591,20 +1377,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_brev_b32 s1, -2 -; GFX1032DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_min_i32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -618,26 +1404,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 -; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_brev_b32 s0, -2 -; GFX1032GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_min_i32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1032GISEL-NEXT: .LBB2_5: ; %endif +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1032GISEL-NEXT: .LBB5_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -662,21 +1448,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_brev_b32 s6, -2 -; GFX1164DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -692,27 +1478,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_brev_b32 s6, -2 -; GFX1164GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_min_i32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1164GISEL-NEXT: .LBB2_5: ; %endif +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1164GISEL-NEXT: .LBB5_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -737,21 +1523,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_brev_b32 s1, -2 -; GFX1132DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_min_i32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -767,27 +1553,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 -; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_brev_b32 s0, -2 -; GFX1132GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_min_i32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1132GISEL-NEXT: .LBB2_5: ; %endif +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1132GISEL-NEXT: .LBB5_5: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 @@ -926,7 +1712,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX8DAGISEL-NEXT: s_brev_b32 s5, -2 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -937,7 +1723,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -951,7 +1737,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_mov_b32 s4, -1 ; GFX8GISEL-NEXT: s_brev_b32 s5, -2 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -962,7 +1748,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -976,7 +1762,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX9DAGISEL-NEXT: s_brev_b32 s5, -2 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -987,7 +1773,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1001,7 +1787,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_mov_b32 s4, -1 ; GFX9GISEL-NEXT: s_brev_b32 s5, -2 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1012,7 +1798,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1026,7 +1812,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX1064DAGISEL-NEXT: s_brev_b32 s5, -2 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1037,7 +1823,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1050,7 +1836,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_mov_b32 s4, -1 ; GFX1064GISEL-NEXT: s_brev_b32 s5, -2 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1061,7 +1847,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1074,7 +1860,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, -1 ; GFX1032DAGISEL-NEXT: s_brev_b32 s5, -2 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1085,7 +1871,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1098,7 +1884,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_mov_b32 s4, -1 ; GFX1032GISEL-NEXT: s_brev_b32 s5, -2 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1109,7 +1895,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1122,7 +1908,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_mov_b32 s0, -1 ; GFX1164DAGISEL-NEXT: s_brev_b32 s1, -2 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1134,7 +1920,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1147,7 +1933,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_mov_b32 s0, -1 ; GFX1164GISEL-NEXT: s_brev_b32 s1, -2 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1159,7 +1945,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1172,7 +1958,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_mov_b32 s0, -1 ; GFX1132DAGISEL-NEXT: s_brev_b32 s1, -2 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -1183,7 +1969,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1195,7 +1981,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_mov_b32 s0, -1 ; GFX1132GISEL-NEXT: s_brev_b32 s1, -2 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -1206,7 +1992,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1247,19 +2033,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX8GISEL-NEXT: .LBB5_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX8GISEL-NEXT: .LBB5_4: ; %endif +; GFX8GISEL-NEXT: .LBB8_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1296,19 +2082,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9GISEL-NEXT: .LBB5_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX9GISEL-NEXT: .LBB5_4: ; %endif +; GFX9GISEL-NEXT: .LBB8_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1345,19 +2131,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX1064GISEL-NEXT: .LBB5_4: ; %endif +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1394,19 +2180,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX1032GISEL-NEXT: .LBB5_4: ; %endif +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1447,19 +2233,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX1164GISEL-NEXT: .LBB5_4: ; %endif +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1498,19 +2284,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX1132GISEL-NEXT: .LBB5_4: ; %endif +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll index d6ccf7ce2831..89487bc210bf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll @@ -367,6 +367,784 @@ entry: ret void } +define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value_dpp: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_dpp: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %in, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %id.x, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: default_stratergy: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: default_stratergy: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: default_stratergy: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: default_stratergy: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: default_stratergy: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: default_stratergy: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: default_stratergy: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: default_stratergy: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: default_stratergy: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: default_stratergy: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: default_stratergy: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: default_stratergy: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.or.i32(i32 %id.x, i32 0) + store i32 %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -382,20 +1160,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX8DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_or_b32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -410,26 +1188,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s6, s2 -; GFX8GISEL-NEXT: .LBB2_2: ; %Flow +; GFX8GISEL-NEXT: .LBB5_2: ; %Flow ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, 0 -; GFX8GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_or_b32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX8GISEL-NEXT: .LBB2_5: ; %endif +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX8GISEL-NEXT: .LBB5_5: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -453,20 +1231,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX9DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_or_b32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -480,26 +1258,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s6, s2 -; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: .LBB5_2: ; %Flow ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, 0 -; GFX9GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_or_b32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX9GISEL-NEXT: .LBB2_5: ; %endif +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX9GISEL-NEXT: .LBB5_5: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -522,20 +1300,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_or_b32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -549,26 +1327,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_or_b32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1064GISEL-NEXT: .LBB2_5: ; %endif +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1064GISEL-NEXT: .LBB5_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -591,20 +1369,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1032DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_or_b32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -618,26 +1396,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s0, s0 -; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1032GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_or_b32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1032GISEL-NEXT: .LBB2_5: ; %endif +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1032GISEL-NEXT: .LBB5_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -662,21 +1440,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_or_b32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -692,27 +1470,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_or_b32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1164GISEL-NEXT: .LBB2_5: ; %endif +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1164GISEL-NEXT: .LBB5_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -737,21 +1515,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_or_b32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -767,27 +1545,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s0, s0 -; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1132GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_or_b32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1132GISEL-NEXT: .LBB2_5: ; %endif +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1132GISEL-NEXT: .LBB5_5: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 @@ -925,14 +1703,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX8DAGISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -945,14 +1723,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX8GISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX8GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -965,14 +1743,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX9DAGISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -985,14 +1763,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX9GISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX9GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1005,14 +1783,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX1064DAGISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1024,14 +1802,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX1064GISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX1064GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1043,14 +1821,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032DAGISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1062,14 +1840,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7 ; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7 ; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032GISEL-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1081,7 +1859,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s6, s[2:3] ; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s6 @@ -1089,7 +1867,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s6 ; GFX1164DAGISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1101,7 +1879,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s6, s[2:3] ; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s6 @@ -1109,7 +1887,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s6 ; GFX1164GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1121,7 +1899,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 @@ -1129,7 +1907,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1140,7 +1918,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3 @@ -1148,7 +1926,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1190,19 +1968,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX8GISEL-NEXT: .LBB5_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX8GISEL-NEXT: .LBB5_4: ; %endif +; GFX8GISEL-NEXT: .LBB8_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1239,19 +2017,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9GISEL-NEXT: .LBB5_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX9GISEL-NEXT: .LBB5_4: ; %endif +; GFX9GISEL-NEXT: .LBB8_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1288,19 +2066,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX1064GISEL-NEXT: .LBB5_4: ; %endif +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1337,19 +2115,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX1032GISEL-NEXT: .LBB5_4: ; %endif +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1390,19 +2168,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX1164GISEL-NEXT: .LBB5_4: ; %endif +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1441,19 +2219,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX1132GISEL-NEXT: .LBB5_4: ; %endif +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll index f09421373168..d3341eb0f809 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll @@ -478,6 +478,956 @@ entry: ret void } +define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_sub_i32 s3, 0, s6 +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s6 +; GFX8GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_sub_i32 s3, 0, s6 +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_sub_i32 s3, 0, s6 +; GFX9GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_clause 0x1 +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_sub_i32 s3, 0, s6 +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_clause 0x1 +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_sub_i32 s3, 0, s6 +; GFX1064GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_clause 0x1 +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_sub_i32 s2, 0, s2 +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_clause 0x1 +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_sub_i32 s2, 0, s2 +; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_sub_i32 s3, 0, s6 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_sub_i32 s3, 0, s6 +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s3, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_sub_i32 s2, 0, s2 +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_sub_i32 s2, 0, s2 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: uniform_value_dpp: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: s_sub_co_i32 s2, 0, s2 +; GFX12DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %in, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_sub_i32 s2, 0, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: s_sub_i32 s2, 0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_sub_i32 s2, 0, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: s_sub_i32 s2, 0, s3 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: divergent_value_dpp: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX12DAGISEL-NEXT: s_wait_dscnt 0x0 +; GFX12DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX12DAGISEL-NEXT: s_sub_co_i32 s2, 0, s3 +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %id.x, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: default_stratergy: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: default_stratergy: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: default_stratergy: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: default_stratergy: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: default_stratergy: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: default_stratergy: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: default_stratergy: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: s_sub_i32 s2, 0, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s2 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: default_stratergy: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: s_sub_i32 s2, 0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: default_stratergy: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: default_stratergy: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: s_sub_i32 s2, 0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: default_stratergy: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: s_sub_i32 s2, 0, s3 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: default_stratergy: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: s_sub_i32 s2, 0, s3 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +; +; GFX12DAGISEL-LABEL: default_stratergy: +; GFX12DAGISEL: ; %bb.0: ; %entry +; GFX12DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX12DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX12DAGISEL-NEXT: s_wait_dscnt 0x0 +; GFX12DAGISEL-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX12DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX12DAGISEL-NEXT: s_sub_co_i32 s2, 0, s3 +; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s2 +; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12DAGISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.sub.i32(i32 %id.x, i32 0) + store i32 %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -485,7 +1435,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else ; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -494,24 +1444,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_sub_i32 s3, 0, s6 ; GFX8DAGISEL-NEXT: s_mul_i32 s2, s3, s2 -; GFX8DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX8DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX8DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_sub_i32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -526,7 +1476,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -535,20 +1485,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_sub_i32 s3, 0, s6 ; GFX8GISEL-NEXT: s_mul_i32 s6, s3, s2 -; GFX8GISEL-NEXT: .LBB2_2: ; %Flow +; GFX8GISEL-NEXT: .LBB5_2: ; %Flow ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, 0 -; GFX8GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_sub_i32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX8GISEL-NEXT: .LBB2_5: ; %endif +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX8GISEL-NEXT: .LBB5_5: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -564,7 +1514,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else ; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -573,24 +1523,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_sub_i32 s3, 0, s6 ; GFX9DAGISEL-NEXT: s_mul_i32 s2, s3, s2 -; GFX9DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX9DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX9DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_sub_i32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -604,7 +1554,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -613,20 +1563,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_sub_i32 s3, 0, s6 ; GFX9GISEL-NEXT: s_mul_i32 s6, s3, s2 -; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: .LBB5_2: ; %Flow ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, 0 -; GFX9GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_sub_i32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX9GISEL-NEXT: .LBB2_5: ; %endif +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX9GISEL-NEXT: .LBB5_5: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -641,7 +1591,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -650,24 +1600,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_sub_i32 s3, 0, s6 ; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s3, s2 -; GFX1064DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_sub_i32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -681,7 +1631,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -690,20 +1640,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_sub_i32 s3, 0, s6 ; GFX1064GISEL-NEXT: s_mul_i32 s6, s3, s2 -; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_sub_i32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1064GISEL-NEXT: .LBB2_5: ; %endif +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1064GISEL-NEXT: .LBB5_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -718,7 +1668,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -727,24 +1677,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_sub_i32 s1, 0, s1 ; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX1032DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1032DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_sub_i32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -758,7 +1708,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo @@ -767,20 +1717,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_sub_i32 s0, 0, s0 ; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2 -; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1032GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_sub_i32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1032GISEL-NEXT: .LBB2_5: ; %endif +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1032GISEL-NEXT: .LBB5_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -797,7 +1747,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -807,25 +1757,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_sub_i32 s3, 0, s6 ; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s3, s2 -; GFX1164DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_sub_i32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -841,7 +1791,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -851,21 +1801,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_sub_i32 s3, 0, s6 ; GFX1164GISEL-NEXT: s_mul_i32 s6, s3, s2 -; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_sub_i32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1164GISEL-NEXT: .LBB2_5: ; %endif +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1164GISEL-NEXT: .LBB5_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -882,7 +1832,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -892,25 +1842,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_sub_i32 s1, 0, s1 ; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX1132DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_sub_i32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -926,7 +1876,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo @@ -936,21 +1886,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_sub_i32 s0, 0, s0 ; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2 -; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1132GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_sub_i32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1132GISEL-NEXT: .LBB2_5: ; %endif +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1132GISEL-NEXT: .LBB5_5: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 @@ -966,7 +1916,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX12DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX12DAGISEL-NEXT: ; %bb.1: ; %else ; GFX12DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -976,15 +1926,15 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12DAGISEL-NEXT: s_sub_co_i32 s1, 0, s1 ; GFX12DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX12DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX12DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX12DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX12DAGISEL-NEXT: ; %bb.3: ; %if ; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX12DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX12DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX12DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 @@ -992,10 +1942,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX12DAGISEL-NEXT: s_sub_co_i32 s1, s1, s6 ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX12DAGISEL-NEXT: ; %bb.5: ; GFX12DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX12DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX12DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX12DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -1300,7 +2250,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX8DAGISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1308,7 +2258,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8 ; GFX8DAGISEL-NEXT: s_subb_u32 s5, s5, s10 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1321,7 +2271,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX8GISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX8GISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1329,7 +2279,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s8 ; GFX8GISEL-NEXT: s_subb_u32 s5, s5, s10 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1342,7 +2292,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX9DAGISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1350,7 +2300,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s8 ; GFX9DAGISEL-NEXT: s_subb_u32 s5, s5, s10 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1363,7 +2313,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX9GISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX9GISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1371,7 +2321,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s8 ; GFX9GISEL-NEXT: s_subb_u32 s5, s5, s10 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1384,7 +2334,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX1064DAGISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1392,7 +2342,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_sub_u32 s4, s4, s9 ; GFX1064DAGISEL-NEXT: s_subb_u32 s5, s5, s10 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1404,7 +2354,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s8, s[6:7] ; GFX1064GISEL-NEXT: v_readlane_b32 s9, v2, s8 ; GFX1064GISEL-NEXT: v_readlane_b32 s10, v3, s8 @@ -1412,7 +2362,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_sub_u32 s4, s4, s9 ; GFX1064GISEL-NEXT: s_subb_u32 s5, s5, s10 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1424,7 +2374,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7 @@ -1432,7 +2382,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_sub_u32 s4, s4, s8 ; GFX1032DAGISEL-NEXT: s_subb_u32 s5, s5, s9 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1444,7 +2394,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7 ; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7 @@ -1452,7 +2402,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_sub_u32 s4, s4, s8 ; GFX1032GISEL-NEXT: s_subb_u32 s5, s5, s9 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1464,7 +2414,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s4, s[2:3] ; GFX1164DAGISEL-NEXT: v_readlane_b32 s5, v2, s4 @@ -1473,7 +2423,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_sub_u32 s0, s0, s5 ; GFX1164DAGISEL-NEXT: s_subb_u32 s1, s1, s6 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1485,7 +2435,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s4, s[2:3] ; GFX1164GISEL-NEXT: v_readlane_b32 s5, v2, s4 @@ -1494,7 +2444,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_sub_u32 s0, s0, s5 ; GFX1164GISEL-NEXT: s_subb_u32 s1, s1, s6 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1506,7 +2456,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 @@ -1515,7 +2465,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_sub_u32 s0, s0, s4 ; GFX1132DAGISEL-NEXT: s_subb_u32 s1, s1, s5 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1526,7 +2476,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3 @@ -1535,7 +2485,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_sub_u32 s0, s0, s4 ; GFX1132GISEL-NEXT: s_subb_u32 s1, s1, s5 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1550,7 +2500,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX12DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX12DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX12DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) @@ -1560,7 +2510,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX12DAGISEL-NEXT: s_sub_nc_u64 s[0:1], s[0:1], s[4:5] -; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX12DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX12DAGISEL-NEXT: ; %bb.2: ; GFX12DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 @@ -1581,7 +2531,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1594,13 +2544,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8DAGISEL-NEXT: s_mul_i32 s3, s3, s7 ; GFX8DAGISEL-NEXT: s_add_u32 s2, s2, s3 ; GFX8DAGISEL-NEXT: s_add_u32 s7, s2, s10 -; GFX8DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1614,7 +2564,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8DAGISEL-NEXT: s_add_u32 s7, s4, s8 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX8DAGISEL-NEXT: .LBB5_4: ; %endif +; GFX8DAGISEL-NEXT: .LBB8_4: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1628,7 +2578,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1641,10 +2591,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: s_mul_i32 s3, s3, s7 ; GFX8GISEL-NEXT: s_add_u32 s2, s2, s3 ; GFX8GISEL-NEXT: s_add_u32 s7, s2, s10 -; GFX8GISEL-NEXT: .LBB5_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec @@ -1658,7 +2608,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: s_mul_i32 s5, s5, s7 ; GFX8GISEL-NEXT: s_add_u32 s4, s4, s5 ; GFX8GISEL-NEXT: s_add_u32 s7, s4, s8 -; GFX8GISEL-NEXT: .LBB5_4: ; %endif +; GFX8GISEL-NEXT: .LBB8_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1675,7 +2625,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1688,13 +2638,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9DAGISEL-NEXT: s_mul_i32 s3, s3, s5 ; GFX9DAGISEL-NEXT: s_add_u32 s2, s2, s3 ; GFX9DAGISEL-NEXT: s_add_u32 s5, s2, s10 -; GFX9DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1708,7 +2658,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9DAGISEL-NEXT: s_add_u32 s5, s5, s8 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9DAGISEL-NEXT: .LBB5_4: ; %endif +; GFX9DAGISEL-NEXT: .LBB8_4: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9DAGISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1721,7 +2671,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1734,10 +2684,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: s_mul_i32 s3, s3, s7 ; GFX9GISEL-NEXT: s_add_u32 s2, s2, s3 ; GFX9GISEL-NEXT: s_add_u32 s7, s2, s10 -; GFX9GISEL-NEXT: .LBB5_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -1751,7 +2701,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: s_mul_i32 s5, s8, s5 ; GFX9GISEL-NEXT: s_add_u32 s4, s7, s4 ; GFX9GISEL-NEXT: s_add_u32 s7, s4, s5 -; GFX9GISEL-NEXT: .LBB5_4: ; %endif +; GFX9GISEL-NEXT: .LBB8_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1768,7 +2718,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1064DAGISEL-NEXT: s_mov_b64 s[8:9], exec ; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s8, s[8:9] @@ -1781,7 +2731,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064DAGISEL-NEXT: s_add_u32 s3, s10, s3 ; GFX1064DAGISEL-NEXT: s_mul_i32 s8, s2, s8 ; GFX1064DAGISEL-NEXT: s_add_u32 s9, s3, s9 -; GFX1064DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[4:5] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s8 @@ -1813,7 +2763,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1826,10 +2776,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: s_add_u32 s3, s10, s3 ; GFX1064GISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX1064GISEL-NEXT: s_add_u32 s7, s3, s7 -; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -1843,7 +2793,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: s_add_u32 s7, s8, s7 ; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4 ; GFX1064GISEL-NEXT: s_add_u32 s7, s7, s5 -; GFX1064GISEL-NEXT: .LBB5_4: ; %endif +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1860,7 +2810,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1873,7 +2823,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032DAGISEL-NEXT: s_add_u32 s3, s9, s3 ; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, s4 ; GFX1032DAGISEL-NEXT: s_add_u32 s5, s3, s5 -; GFX1032DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -1905,7 +2855,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo ; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6 @@ -1918,10 +2868,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: s_add_u32 s3, s9, s3 ; GFX1032GISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX1032GISEL-NEXT: s_add_u32 s7, s3, s7 -; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo @@ -1935,7 +2885,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: s_add_u32 s5, s5, s7 ; GFX1032GISEL-NEXT: s_mul_i32 s6, s6, s3 ; GFX1032GISEL-NEXT: s_add_u32 s7, s5, s4 -; GFX1032GISEL-NEXT: .LBB5_4: ; %endif +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1954,7 +2904,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[6:7], exec, s[6:7] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1164DAGISEL-NEXT: s_mov_b64 s[8:9], exec ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1969,7 +2919,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164DAGISEL-NEXT: s_add_u32 s3, s10, s3 ; GFX1164DAGISEL-NEXT: s_mul_i32 s8, s2, s8 ; GFX1164DAGISEL-NEXT: s_add_u32 s9, s3, s9 -; GFX1164DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[6:7] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s8 @@ -2005,7 +2955,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -2020,10 +2970,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_add_u32 s3, s10, s3 ; GFX1164GISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX1164GISEL-NEXT: s_add_u32 s7, s3, s7 -; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec @@ -2039,7 +2989,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_add_u32 s5, s8, s5 ; GFX1164GISEL-NEXT: s_mul_i32 s6, s4, s6 ; GFX1164GISEL-NEXT: s_add_u32 s7, s5, s7 -; GFX1164GISEL-NEXT: .LBB5_4: ; %endif +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -2058,7 +3008,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -2073,7 +3023,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132DAGISEL-NEXT: s_add_u32 s3, s9, s3 ; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX1132DAGISEL-NEXT: s_add_u32 s7, s3, s7 -; GFX1132DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -2108,7 +3058,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -2123,10 +3073,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_add_u32 s3, s9, s3 ; GFX1132GISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX1132GISEL-NEXT: s_add_u32 s7, s3, s7 -; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo @@ -2142,7 +3092,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_add_u32 s5, s7, s5 ; GFX1132GISEL-NEXT: s_mul_i32 s6, s4, s3 ; GFX1132GISEL-NEXT: s_add_u32 s7, s5, s8 -; GFX1132GISEL-NEXT: .LBB5_4: ; %endif +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 @@ -2160,7 +3110,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX12DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX12DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX12DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX12DAGISEL-NEXT: ; %bb.1: ; %else ; GFX12DAGISEL-NEXT: s_mov_b32 s6, exec_lo ; GFX12DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -2175,7 +3125,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX12DAGISEL-NEXT: s_add_co_u32 s3, s9, s3 ; GFX12DAGISEL-NEXT: s_mul_i32 s6, s2, s6 ; GFX12DAGISEL-NEXT: s_add_co_u32 s7, s3, s7 -; GFX12DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX12DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX12DAGISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX12DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index 24084de87757..f6e3b0ed78b2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -368,6 +368,784 @@ entry: ret void } +define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value_dpp: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_dpp: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: divergent_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: default_stratergy: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: default_stratergy: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: default_stratergy: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: default_stratergy: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: default_stratergy: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: default_stratergy: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: default_stratergy: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: default_stratergy: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: default_stratergy: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: default_stratergy: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: default_stratergy: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: default_stratergy: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 0) + store i32 %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -383,20 +1161,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX8DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -411,30 +1189,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s2, s2 -; GFX8GISEL-NEXT: .LBB2_2: ; %Flow +; GFX8GISEL-NEXT: .LBB5_2: ; %Flow ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, 0 -; GFX8GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX8GISEL-NEXT: ; %bb.5: ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8GISEL-NEXT: .LBB2_6: ; %endif +; GFX8GISEL-NEXT: .LBB5_6: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -457,20 +1235,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX9DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -484,30 +1262,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s2, s2 -; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: .LBB5_2: ; %Flow ; GFX9GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, 0 -; GFX9GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9GISEL-NEXT: ; %bb.5: ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9GISEL-NEXT: .LBB2_6: ; %endif +; GFX9GISEL-NEXT: .LBB5_6: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -529,20 +1307,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -556,26 +1334,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1064GISEL-NEXT: .LBB2_5: ; %endif +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1064GISEL-NEXT: .LBB5_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -598,20 +1376,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1032DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_max_u32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -625,30 +1403,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s1, s1 -; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s1, 0 -; GFX1032GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_max_u32 s1, s1, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032GISEL-NEXT: ; %bb.5: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032GISEL-NEXT: .LBB2_6: ; %endif +; GFX1032GISEL-NEXT: .LBB5_6: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -672,21 +1450,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -702,27 +1480,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_max_u32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1164GISEL-NEXT: .LBB2_5: ; %endif +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1164GISEL-NEXT: .LBB5_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -747,21 +1525,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_max_u32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -777,31 +1555,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s1, s1 -; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_max_u32 s1, s1, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132GISEL-NEXT: ; %bb.5: ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132GISEL-NEXT: .LBB2_6: ; %endif +; GFX1132GISEL-NEXT: .LBB5_6: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -1079,7 +1857,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1090,7 +1868,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1103,7 +1881,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1114,7 +1892,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1127,7 +1905,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1138,7 +1916,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1151,7 +1929,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1162,7 +1940,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1175,7 +1953,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1186,7 +1964,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1198,7 +1976,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1209,7 +1987,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1221,7 +1999,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1232,7 +2010,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1244,7 +2022,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1255,7 +2033,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1267,7 +2045,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1279,7 +2057,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1291,7 +2069,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1303,7 +2081,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1315,7 +2093,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -1326,7 +2104,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1337,7 +2115,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -1348,7 +2126,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB6_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1389,24 +2167,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB7_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX8GISEL-NEXT: .LBB7_2: ; %Flow +; GFX8GISEL-NEXT: .LBB10_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB7_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB10_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], s[4:5] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX8GISEL-NEXT: .LBB7_4: ; %endif +; GFX8GISEL-NEXT: .LBB10_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1441,24 +2219,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB7_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9GISEL-NEXT: .LBB7_2: ; %Flow +; GFX9GISEL-NEXT: .LBB10_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB7_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB10_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9GISEL-NEXT: .LBB7_4: ; %endif +; GFX9GISEL-NEXT: .LBB10_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1493,19 +2271,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB7_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064GISEL-NEXT: .LBB7_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB10_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB7_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB10_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX1064GISEL-NEXT: .LBB7_4: ; %endif +; GFX1064GISEL-NEXT: .LBB10_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1542,24 +2320,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB7_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032GISEL-NEXT: .LBB7_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB10_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB7_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB10_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032GISEL-NEXT: .LBB7_4: ; %endif +; GFX1032GISEL-NEXT: .LBB10_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1598,19 +2376,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1164GISEL-NEXT: .LBB7_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB10_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB7_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB10_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX1164GISEL-NEXT: .LBB7_4: ; %endif +; GFX1164GISEL-NEXT: .LBB10_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1649,23 +2427,23 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1132GISEL-NEXT: .LBB7_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB10_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB7_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB10_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[4:5], s[4:5] ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1132GISEL-NEXT: .LBB7_4: ; %endif +; GFX1132GISEL-NEXT: .LBB10_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 9ff4fba28253..4f01ea425aca 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -368,6 +368,784 @@ entry: ret void } +define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX10DAGISEL-LABEL: uniform_value_dpp: +; GFX10DAGISEL: ; %bb.0: ; %entry +; GFX10DAGISEL-NEXT: s_clause 0x1 +; GFX10DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10DAGISEL-NEXT: s_endpgm +; +; GFX10GISEL-LABEL: uniform_value_dpp: +; GFX10GISEL: ; %bb.0: ; %entry +; GFX10GISEL-NEXT: s_clause 0x1 +; GFX10GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %in, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1032DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1032GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %id.x, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: default_stratergy: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: default_stratergy: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: default_stratergy: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: default_stratergy: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: default_stratergy: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: default_stratergy: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: default_stratergy: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1032DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: default_stratergy: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1032GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: default_stratergy: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: default_stratergy: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: default_stratergy: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: default_stratergy: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, -1, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %id.x, i32 0) + store i32 %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -383,20 +1161,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX8DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -411,30 +1189,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b32 s2, s2 -; GFX8GISEL-NEXT: .LBB2_2: ; %Flow +; GFX8GISEL-NEXT: .LBB5_2: ; %Flow ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, -1 -; GFX8GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX8GISEL-NEXT: ; %bb.5: ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8GISEL-NEXT: .LBB2_6: ; %endif +; GFX8GISEL-NEXT: .LBB5_6: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -457,20 +1235,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX9DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -484,30 +1262,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b32 s2, s2 -; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: .LBB5_2: ; %Flow ; GFX9GISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, -1 -; GFX9GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9GISEL-NEXT: ; %bb.5: ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9GISEL-NEXT: .LBB2_6: ; %endif +; GFX9GISEL-NEXT: .LBB5_6: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -529,20 +1307,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX1064DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -556,26 +1334,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, -1 -; GFX1064GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1064GISEL-NEXT: .LBB2_5: ; %endif +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1064GISEL-NEXT: .LBB5_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -598,20 +1376,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, -1 -; GFX1032DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_min_u32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -625,30 +1403,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b32 s1, s1 -; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s1, -1 -; GFX1032GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_min_u32 s1, s1, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032GISEL-NEXT: ; %bb.5: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032GISEL-NEXT: .LBB2_6: ; %endif +; GFX1032GISEL-NEXT: .LBB5_6: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -672,21 +1450,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, -1 -; GFX1164DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -702,27 +1480,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX1164GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b32 s6, s2 -; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, -1 -; GFX1164GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_min_u32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1164GISEL-NEXT: .LBB2_5: ; %endif +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1164GISEL-NEXT: .LBB5_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -747,21 +1525,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, -1 -; GFX1132DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_min_u32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -777,31 +1555,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b32 s1, s1 -; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s1, -1 -; GFX1132GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_min_u32 s1, s1, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132GISEL-NEXT: ; %bb.5: ; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132GISEL-NEXT: .LBB2_6: ; %endif +; GFX1132GISEL-NEXT: .LBB5_6: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -939,7 +1717,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -950,7 +1728,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -963,7 +1741,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX8GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -974,7 +1752,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX8GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -987,7 +1765,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -998,7 +1776,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1011,7 +1789,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1022,7 +1800,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX9GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1035,7 +1813,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1046,7 +1824,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1058,7 +1836,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s12, s[6:7] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1069,7 +1847,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s12 ; GFX1064GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1081,7 +1859,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1092,7 +1870,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032DAGISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1104,7 +1882,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], -1 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v4, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v5, s5 @@ -1115,7 +1893,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032GISEL-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1127,7 +1905,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1139,7 +1917,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1151,7 +1929,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s8, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, s0 @@ -1163,7 +1941,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s8 ; GFX1164GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1175,7 +1953,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -1186,7 +1964,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1197,7 +1975,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], -1 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 @@ -1208,7 +1986,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1249,24 +2027,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX8GISEL-NEXT: .LBB5_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX8GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], s[4:5] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX8GISEL-NEXT: .LBB5_4: ; %endif +; GFX8GISEL-NEXT: .LBB8_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1301,24 +2079,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX9GISEL-NEXT: .LBB5_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9GISEL-NEXT: s_xor_b64 exec, exec, s[2:3] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9GISEL-NEXT: .LBB5_4: ; %endif +; GFX9GISEL-NEXT: .LBB8_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1353,19 +2131,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], s[6:7] -; GFX1064GISEL-NEXT: .LBB5_4: ; %endif +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1402,24 +2180,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], s[6:7] ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032GISEL-NEXT: .LBB5_4: ; %endif +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1458,19 +2236,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] -; GFX1164GISEL-NEXT: .LBB5_4: ; %endif +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1509,23 +2287,23 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[6:7], s[2:3] -; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s2 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[4:5], s[4:5] ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1132GISEL-NEXT: .LBB5_4: ; %endif +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1132GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll index d5f1750c268a..63c42dee8a71 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll @@ -443,6 +443,860 @@ entry: ret void } +define amdgpu_kernel void @uniform_value_dpp(ptr addrspace(1) %out, i32 %in) { +; GFX8DAGISEL-LABEL: uniform_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: uniform_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: uniform_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: uniform_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: uniform_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_clause 0x1 +; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: uniform_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_clause 0x1 +; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: uniform_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_clause 0x1 +; GFX1032DAGISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032DAGISEL-NEXT: s_and_b32 s3, s3, 1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: uniform_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_clause 0x1 +; GFX1032GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032GISEL-NEXT: s_and_b32 s3, s3, 1 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: uniform_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: s_clause 0x1 +; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: uniform_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: s_clause 0x1 +; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: s_mul_i32 s2, s6, s2 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: uniform_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: s_clause 0x1 +; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132DAGISEL-NEXT: s_and_b32 s3, s3, 1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: uniform_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: s_clause 0x1 +; GFX1132GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: s_and_b32 s3, s3, 1 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: s_mul_i32 s2, s2, s3 +; GFX1132GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %in, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value_dpp(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: divergent_value_dpp: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: divergent_value_dpp: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: divergent_value_dpp: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: divergent_value_dpp: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: divergent_value_dpp: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: divergent_value_dpp: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: divergent_value_dpp: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: divergent_value_dpp: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: divergent_value_dpp: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: divergent_value_dpp: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: divergent_value_dpp: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: divergent_value_dpp: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %id.x, i32 2) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) { +; GFX8DAGISEL-LABEL: default_stratergy: +; GFX8DAGISEL: ; %bb.0: ; %entry +; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: s_nop 1 +; GFX8DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8DAGISEL-NEXT: s_endpgm +; +; GFX8GISEL-LABEL: default_stratergy: +; GFX8GISEL: ; %bb.0: ; %entry +; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX8GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: s_nop 1 +; GFX8GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX8GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX8GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL-NEXT: flat_store_dword v[2:3], v0 +; GFX8GISEL-NEXT: s_endpgm +; +; GFX9DAGISEL-LABEL: default_stratergy: +; GFX9DAGISEL: ; %bb.0: ; %entry +; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: s_nop 1 +; GFX9DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9DAGISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9DAGISEL-NEXT: s_endpgm +; +; GFX9GISEL-LABEL: default_stratergy: +; GFX9GISEL: ; %bb.0: ; %entry +; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[2:3] +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: s_nop 1 +; GFX9GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xf bank_mask:0xf +; GFX9GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9GISEL-NEXT: s_endpgm +; +; GFX1064DAGISEL-LABEL: default_stratergy: +; GFX1064DAGISEL: ; %bb.0: ; %entry +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1064DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1064DAGISEL-NEXT: global_store_dword v0, v4, s[0:1] +; GFX1064DAGISEL-NEXT: s_endpgm +; +; GFX1064GISEL-LABEL: default_stratergy: +; GFX1064GISEL: ; %bb.0: ; %entry +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1064GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1064GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1064GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1064GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1064GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1064GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1064GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL-NEXT: global_store_dword v4, v0, s[0:1] +; GFX1064GISEL-NEXT: s_endpgm +; +; GFX1032DAGISEL-LABEL: default_stratergy: +; GFX1032DAGISEL: ; %bb.0: ; %entry +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032DAGISEL-NEXT: s_waitcnt_depctr depctr_vm_vsrc(0) +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1032DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s3 +; GFX1032DAGISEL-NEXT: global_store_dword v0, v3, s[0:1] +; GFX1032DAGISEL-NEXT: s_endpgm +; +; GFX1032GISEL-LABEL: default_stratergy: +; GFX1032GISEL: ; %bb.0: ; %entry +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1032GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX1032GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1032GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL-NEXT: global_store_dword v3, v0, s[0:1] +; GFX1032GISEL-NEXT: s_endpgm +; +; GFX1164DAGISEL-LABEL: default_stratergy: +; GFX1164DAGISEL: ; %bb.0: ; %entry +; GFX1164DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164DAGISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164DAGISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v4, s4 +; GFX1164DAGISEL-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX1164DAGISEL-NEXT: s_endpgm +; +; GFX1164GISEL-LABEL: default_stratergy: +; GFX1164GISEL: ; %bb.0: ; %entry +; GFX1164GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1164GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1] +; GFX1164GISEL-NEXT: v_mbcnt_lo_u32_b32 v3, -1, 0 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_add_nc_u32_e32 v3, 32, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: v_mul_lo_u32 v3, 4, v3 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1164GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: ds_permute_b32 v2, v3, v1 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[0:1] +; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1164GISEL-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164GISEL-NEXT: v_readlane_b32 s4, v1, 63 +; GFX1164GISEL-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX1164GISEL-NEXT: s_endpgm +; +; GFX1132DAGISEL-LABEL: default_stratergy: +; GFX1132DAGISEL: ; %bb.0: ; %entry +; GFX1132DAGISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132DAGISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132DAGISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132DAGISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3 +; GFX1132DAGISEL-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX1132DAGISEL-NEXT: s_endpgm +; +; GFX1132GISEL-LABEL: default_stratergy: +; GFX1132GISEL: ; %bb.0: ; %entry +; GFX1132GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1132GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v0, s0 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1132GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1132GISEL-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132GISEL-NEXT: v_readlane_b32 s3, v1, 31 +; GFX1132GISEL-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, 0 +; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX1132GISEL-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 @llvm.amdgcn.wave.reduce.xor.i32(i32 %id.x, i32 0) + store i32 %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-LABEL: divergent_cfg: ; GFX8DAGISEL: ; %bb.0: ; %entry @@ -450,7 +1304,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else ; GFX8DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -459,24 +1313,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX8DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX8DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX8DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX8DAGISEL-NEXT: ; %bb.3: ; %if ; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX8DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX8DAGISEL-NEXT: ; %bb.5: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX8DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX8DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -491,7 +1345,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -500,20 +1354,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX8GISEL-NEXT: .LBB2_2: ; %Flow +; GFX8GISEL-NEXT: .LBB5_2: ; %Flow ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX8GISEL-NEXT: s_mov_b32 s6, 0 -; GFX8GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX8GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX8GISEL-NEXT: .LBB2_5: ; %endif +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX8GISEL-NEXT: .LBB5_5: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6 @@ -529,7 +1383,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else ; GFX9DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -538,24 +1392,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX9DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX9DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX9DAGISEL-NEXT: ; %bb.3: ; %if ; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX9DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX9DAGISEL-NEXT: ; %bb.5: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX9DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX9DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -569,7 +1423,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -578,20 +1432,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9GISEL-NEXT: ; implicit-def: $vgpr0 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX9GISEL-NEXT: .LBB2_2: ; %Flow +; GFX9GISEL-NEXT: .LBB5_2: ; %Flow ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX9GISEL-NEXT: s_mov_b32 s6, 0 -; GFX9GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX9GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX9GISEL-NEXT: .LBB2_5: ; %endif +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX9GISEL-NEXT: .LBB5_5: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -606,7 +1460,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr2 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1064DAGISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -615,24 +1469,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX1064DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1064DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1064DAGISEL-NEXT: ; %bb.5: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1064DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -646,7 +1500,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_load_dword s6, s[4:5], 0x2c ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -655,20 +1509,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX1064GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1064GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1064GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1064GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1064GISEL-NEXT: .LBB2_5: ; %endif +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1064GISEL-NEXT: .LBB5_5: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -683,7 +1537,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1032DAGISEL-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -692,24 +1546,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX1032DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1032DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1032DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1032DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1032DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032DAGISEL-NEXT: s_xor_b32 s1, s1, s6 ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1032DAGISEL-NEXT: ; %bb.5: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1032DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -723,7 +1577,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr0 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_load_dword s0, s[4:5], 0x2c ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo @@ -732,20 +1586,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mul_i32 s0, s0, s2 -; GFX1032GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s1, s1 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1032GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1032GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s3, s2 ; GFX1032GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1032GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1032GISEL-NEXT: s_xor_b32 s0, s0, s6 ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1032GISEL-NEXT: .LBB2_5: ; %endif +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1032GISEL-NEXT: .LBB5_5: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0 @@ -762,7 +1616,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1164DAGISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec @@ -772,25 +1626,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mul_i32 s2, s6, s2 -; GFX1164DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164DAGISEL-NEXT: s_xor_b64 exec, exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1164DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164DAGISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164DAGISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1164DAGISEL-NEXT: ; %bb.5: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6 -; GFX1164DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1164DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -806,7 +1660,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_load_b32 s6, s[4:5], 0x2c ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec @@ -816,21 +1670,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mul_i32 s6, s6, s2 -; GFX1164GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[0:1], s[0:1] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec ; GFX1164GISEL-NEXT: s_mov_b32 s6, 0 -; GFX1164GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s7, s[2:3] ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s7 ; GFX1164GISEL-NEXT: s_xor_b32 s6, s6, s8 ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1164GISEL-NEXT: .LBB2_5: ; %endif +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1164GISEL-NEXT: .LBB5_5: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -847,7 +1701,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[4:5], 0x2c ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo @@ -857,25 +1711,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mul_i32 s1, s1, s2 -; GFX1132DAGISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132DAGISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX1132DAGISEL-NEXT: s_xor_b32 exec_lo, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB2_6 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_6 ; GFX1132DAGISEL-NEXT: ; %bb.3: ; %if ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132DAGISEL-NEXT: s_mov_b32 s1, 0 -; GFX1132DAGISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_xor_b32 s1, s1, s6 ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB2_4 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB5_4 ; GFX1132DAGISEL-NEXT: ; %bb.5: ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX1132DAGISEL-NEXT: .LBB2_6: ; %endif +; GFX1132DAGISEL-NEXT: .LBB5_6: ; %endif ; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0 @@ -891,7 +1745,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo @@ -901,21 +1755,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132GISEL-NEXT: s_and_b32 s2, s2, 1 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mul_i32 s0, s0, s2 -; GFX1132GISEL-NEXT: .LBB2_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_5 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo ; GFX1132GISEL-NEXT: s_mov_b32 s0, 0 -; GFX1132GISEL-NEXT: .LBB2_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB5_4: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_xor_b32 s0, s0, s6 ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB2_4 -; GFX1132GISEL-NEXT: .LBB2_5: ; %endif +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB5_4 +; GFX1132GISEL-NEXT: .LBB5_5: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 @@ -1137,14 +1991,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX8DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX8DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX8DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX8DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8DAGISEL-NEXT: ; %bb.2: ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1157,14 +2011,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX8GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX8GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX8GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX8GISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX8GISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX8GISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX8GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GFX8GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8GISEL-NEXT: ; %bb.2: ; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1177,14 +2031,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX9DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX9DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX9DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX9DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9DAGISEL-NEXT: ; %bb.2: ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1197,14 +2051,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX9GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX9GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX9GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX9GISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX9GISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX9GISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX9GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GFX9GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9GISEL-NEXT: ; %bb.2: ; GFX9GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1217,14 +2071,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064DAGISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX1064DAGISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX1064DAGISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX1064DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064DAGISEL-NEXT: ; %bb.2: ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1236,14 +2090,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1064GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec -; GFX1064GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1064GISEL-NEXT: s_ff1_i32_b64 s10, s[6:7] ; GFX1064GISEL-NEXT: v_readlane_b32 s8, v2, s10 ; GFX1064GISEL-NEXT: v_readlane_b32 s9, v3, s10 ; GFX1064GISEL-NEXT: s_bitset0_b64 s[6:7], s10 ; GFX1064GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[6:7], 0 -; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064GISEL-NEXT: ; %bb.2: ; GFX1064GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1255,14 +2109,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032DAGISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s8, v2, s7 ; GFX1032DAGISEL-NEXT: v_readlane_b32 s9, v3, s7 ; GFX1032DAGISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032DAGISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032DAGISEL-NEXT: ; %bb.2: ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1274,14 +2128,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1032GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mov_b64 s[4:5], 0 ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo -; GFX1032GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1032GISEL-NEXT: s_ff1_i32_b32 s7, s6 ; GFX1032GISEL-NEXT: v_readlane_b32 s8, v2, s7 ; GFX1032GISEL-NEXT: v_readlane_b32 s9, v3, s7 ; GFX1032GISEL-NEXT: s_bitset0_b32 s6, s7 ; GFX1032GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] ; GFX1032GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1032GISEL-NEXT: ; %bb.2: ; GFX1032GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v3, s5 @@ -1293,7 +2147,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s6, s[2:3] ; GFX1164DAGISEL-NEXT: v_readlane_b32 s4, v2, s6 @@ -1301,7 +2155,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s6 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164DAGISEL-NEXT: ; %bb.2: ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1313,7 +2167,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec -; GFX1164GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164GISEL-NEXT: s_ctz_i32_b64 s6, s[2:3] ; GFX1164GISEL-NEXT: v_readlane_b32 s4, v2, s6 @@ -1321,7 +2175,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s6 ; GFX1164GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1164GISEL-NEXT: ; %bb.2: ; GFX1164GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -1333,7 +2187,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132DAGISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132DAGISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132DAGISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132DAGISEL-NEXT: v_readlane_b32 s4, v2, s3 @@ -1341,7 +2195,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132DAGISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132DAGISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132DAGISEL-NEXT: ; %bb.2: ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132DAGISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1352,7 +2206,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX1132GISEL-NEXT: s_mov_b32 s2, exec_lo -; GFX1132GISEL-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132GISEL-NEXT: s_ctz_i32_b32 s3, s2 ; GFX1132GISEL-NEXT: v_readlane_b32 s4, v2, s3 @@ -1360,7 +2214,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) { ; GFX1132GISEL-NEXT: s_bitset0_b32 s2, s3 ; GFX1132GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX1132GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_1 +; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1132GISEL-NEXT: ; %bb.2: ; GFX1132GISEL-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX1132GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -1380,7 +2234,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else ; GFX8DAGISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX8DAGISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1388,7 +2242,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_mul_i32 s6, s2, s7 ; GFX8DAGISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX8DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX8DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -1416,7 +2270,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX8GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX8GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX8GISEL-NEXT: ; %bb.1: ; %else ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX8GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1424,10 +2278,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mul_i32 s6, s2, s7 ; GFX8GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX8GISEL-NEXT: .LBB5_2: ; %Flow +; GFX8GISEL-NEXT: .LBB8_2: ; %Flow ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX8GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX8GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX8GISEL-NEXT: ; %bb.3: ; %if ; GFX8GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8GISEL-NEXT: s_mov_b64 s[6:7], exec @@ -1436,7 +2290,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8GISEL-NEXT: s_mul_i32 s6, s4, s7 ; GFX8GISEL-NEXT: s_mul_i32 s7, s5, s7 -; GFX8GISEL-NEXT: .LBB5_4: ; %endif +; GFX8GISEL-NEXT: .LBB8_4: ; %endif ; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX8GISEL-NEXT: v_mov_b32_e32 v3, s1 @@ -1453,7 +2307,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else ; GFX9DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX9DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1461,7 +2315,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_mul_i32 s4, s2, s5 ; GFX9DAGISEL-NEXT: s_mul_i32 s5, s3, s5 -; GFX9DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX9DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -1488,7 +2342,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX9GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX9GISEL-NEXT: ; %bb.1: ; %else ; GFX9GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX9GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1496,10 +2350,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mul_i32 s6, s2, s7 ; GFX9GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX9GISEL-NEXT: .LBB5_2: ; %Flow +; GFX9GISEL-NEXT: .LBB8_2: ; %Flow ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX9GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX9GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX9GISEL-NEXT: ; %bb.3: ; %if ; GFX9GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -1508,7 +2362,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9GISEL-NEXT: s_mul_i32 s6, s6, s4 ; GFX9GISEL-NEXT: s_mul_i32 s7, s7, s4 -; GFX9GISEL-NEXT: .LBB5_4: ; %endif +; GFX9GISEL-NEXT: .LBB8_4: ; %endif ; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1525,7 +2379,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1064DAGISEL-NEXT: s_mov_b64 s[4:5], exec ; GFX1064DAGISEL-NEXT: s_bcnt1_i32_b64 s4, s[4:5] @@ -1533,7 +2387,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_mul_i32 s4, s2, s5 ; GFX1064DAGISEL-NEXT: s_mul_i32 s5, s3, s5 -; GFX1064DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1064DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -1560,7 +2414,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1064GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX1064GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1064GISEL-NEXT: ; %bb.1: ; %else ; GFX1064GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX1064GISEL-NEXT: s_bcnt1_i32_b64 s6, s[6:7] @@ -1568,10 +2422,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mul_i32 s6, s2, s7 ; GFX1064GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1064GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1064GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[8:9] -; GFX1064GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1064GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1064GISEL-NEXT: ; %bb.3: ; %if ; GFX1064GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1064GISEL-NEXT: s_mov_b64 s[4:5], exec @@ -1580,7 +2434,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064GISEL-NEXT: s_mul_i32 s6, s6, s4 ; GFX1064GISEL-NEXT: s_mul_i32 s7, s7, s4 -; GFX1064GISEL-NEXT: .LBB5_4: ; %endif +; GFX1064GISEL-NEXT: .LBB8_4: ; %endif ; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1597,7 +2451,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1032DAGISEL-NEXT: s_mov_b32 s4, exec_lo ; GFX1032DAGISEL-NEXT: s_bcnt1_i32_b32 s4, s4 @@ -1605,7 +2459,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_mul_i32 s4, s2, s5 ; GFX1032DAGISEL-NEXT: s_mul_i32 s5, s3, s5 -; GFX1032DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1032DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -1632,7 +2486,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1032GISEL-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1032GISEL-NEXT: ; %bb.1: ; %else ; GFX1032GISEL-NEXT: s_mov_b32 s6, exec_lo ; GFX1032GISEL-NEXT: s_bcnt1_i32_b32 s6, s6 @@ -1640,10 +2494,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mul_i32 s6, s2, s7 ; GFX1032GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1032GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1032GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_andn2_saveexec_b32 s2, s8 -; GFX1032GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1032GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1032GISEL-NEXT: ; %bb.3: ; %if ; GFX1032GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo @@ -1652,7 +2506,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032GISEL-NEXT: s_mul_i32 s6, s6, s3 ; GFX1032GISEL-NEXT: s_mul_i32 s7, s7, s3 -; GFX1032GISEL-NEXT: .LBB5_4: ; %endif +; GFX1032GISEL-NEXT: .LBB8_4: ; %endif ; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1671,7 +2525,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1164DAGISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1680,7 +2534,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_mul_i32 s6, s2, s7 ; GFX1164DAGISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1164DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1164DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s6 @@ -1711,7 +2565,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1164GISEL-NEXT: s_xor_b64 s[8:9], exec, s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1164GISEL-NEXT: ; %bb.1: ; %else ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec ; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1720,10 +2574,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mul_i32 s6, s2, s7 ; GFX1164GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1164GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1164GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_and_not1_saveexec_b64 s[2:3], s[8:9] -; GFX1164GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1164GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164GISEL-NEXT: ; %bb.3: ; %if ; GFX1164GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1164GISEL-NEXT: s_mov_b64 s[6:7], exec @@ -1733,7 +2587,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164GISEL-NEXT: s_mul_i32 s6, s4, s7 ; GFX1164GISEL-NEXT: s_mul_i32 s7, s5, s7 -; GFX1164GISEL-NEXT: .LBB5_4: ; %endif +; GFX1164GISEL-NEXT: .LBB8_4: ; %endif ; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6 ; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, s7 @@ -1752,7 +2606,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1132DAGISEL-NEXT: s_mov_b32 s6, exec_lo ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1761,7 +2615,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_mul_i32 s6, s2, s7 ; GFX1132DAGISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1132DAGISEL-NEXT: .LBB5_2: ; %Flow +; GFX1132DAGISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s2, s8 ; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 @@ -1791,7 +2645,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132GISEL-NEXT: v_cmpx_le_u32_e32 16, v0 ; GFX1132GISEL-NEXT: s_xor_b32 s8, exec_lo, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_2 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132GISEL-NEXT: ; %bb.1: ; %else ; GFX1132GISEL-NEXT: s_mov_b32 s6, exec_lo ; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) @@ -1800,10 +2654,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mul_i32 s6, s2, s7 ; GFX1132GISEL-NEXT: s_mul_i32 s7, s3, s7 -; GFX1132GISEL-NEXT: .LBB5_2: ; %Flow +; GFX1132GISEL-NEXT: .LBB8_2: ; %Flow ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_and_not1_saveexec_b32 s2, s8 -; GFX1132GISEL-NEXT: s_cbranch_execz .LBB5_4 +; GFX1132GISEL-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132GISEL-NEXT: ; %bb.3: ; %if ; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo @@ -1813,7 +2667,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64 ; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132GISEL-NEXT: s_mul_i32 s6, s4, s3 ; GFX1132GISEL-NEXT: s_mul_i32 s7, s5, s3 -; GFX1132GISEL-NEXT: .LBB5_4: ; %endif +; GFX1132GISEL-NEXT: .LBB8_4: ; %endif ; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX1132GISEL-NEXT: v_mov_b32_e32 v2, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll b/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll index caedcc5e44ba..618a4c294cae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sponentry.ll @@ -317,34 +317,50 @@ define amdgpu_gfx ptr addrspace(5) @sponentry_gfx_dyn_alloc(i32 %val) #0 { ; DAGISEL-NEXT: s_wait_samplecnt 0x0 ; DAGISEL-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL-NEXT: s_wait_kmcnt 0x0 -; DAGISEL-NEXT: v_lshl_add_u32 v1, v0, 2, 15 -; DAGISEL-NEXT: s_mov_b32 s34, s33 -; DAGISEL-NEXT: s_mov_b32 s1, exec_lo -; DAGISEL-NEXT: s_mov_b32 s0, 0 +; DAGISEL-NEXT: s_mov_b32 s2, s33 ; DAGISEL-NEXT: s_mov_b32 s33, s32 -; DAGISEL-NEXT: v_and_b32_e32 v1, -16, v1 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:8 +; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: v_lshl_add_u32 v3, v0, 2, 15 ; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16 -; DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; DAGISEL-NEXT: v_and_b32_e32 v3, -16, v3 +; DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 ; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; DAGISEL-NEXT: s_ctz_i32_b32 s2, s1 +; DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s0 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; DAGISEL-NEXT: s_wait_dscnt 0x0 +; DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_readlane_b32 s1, v1, 31 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_mov_b32 s0, s32 ; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; DAGISEL-NEXT: v_readlane_b32 s3, v1, s2 -; DAGISEL-NEXT: s_bitset0_b32 s1, s2 -; DAGISEL-NEXT: s_max_u32 s0, s0, s3 -; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 -; DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1 -; DAGISEL-NEXT: ; %bb.2: -; DAGISEL-NEXT: s_mov_b32 s1, s32 -; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; DAGISEL-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; DAGISEL-NEXT: v_lshl_add_u32 v3, s1, 5, s0 ; DAGISEL-NEXT: s_wait_storecnt 0x0 -; DAGISEL-NEXT: scratch_store_b32 off, v0, s1 scope:SCOPE_SYS +; DAGISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; DAGISEL-NEXT: s_wait_storecnt 0x0 ; DAGISEL-NEXT: v_mov_b32_e32 v0, s33 -; DAGISEL-NEXT: v_readfirstlane_b32 s32, v1 +; DAGISEL-NEXT: v_readfirstlane_b32 s32, v3 ; DAGISEL-NEXT: s_mov_b32 s32, s33 -; DAGISEL-NEXT: s_mov_b32 s33, s34 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:8 +; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_mov_b32 s33, s2 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 ; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; DAGISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -355,34 +371,51 @@ define amdgpu_gfx ptr addrspace(5) @sponentry_gfx_dyn_alloc(i32 %val) #0 { ; GISEL-NEXT: s_wait_samplecnt 0x0 ; GISEL-NEXT: s_wait_bvhcnt 0x0 ; GISEL-NEXT: s_wait_kmcnt 0x0 -; GISEL-NEXT: v_lshl_add_u32 v1, v0, 2, 15 -; GISEL-NEXT: s_mov_b32 s34, s33 -; GISEL-NEXT: s_mov_b32 s1, exec_lo -; GISEL-NEXT: s_mov_b32 s0, 0 +; GISEL-NEXT: s_mov_b32 s3, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 -; GISEL-NEXT: v_and_b32_e32 v1, -16, v1 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill +; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:8 +; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: v_lshl_add_u32 v3, v0, 2, 15 ; GISEL-NEXT: s_add_co_i32 s32, s32, 16 -; GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GISEL-NEXT: s_mov_b32 s0, s32 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_and_b32_e32 v3, -16, v3 +; GISEL-NEXT: s_or_saveexec_b32 s1, -1 ; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GISEL-NEXT: v_readlane_b32 s3, v1, s2 -; GISEL-NEXT: s_bitset0_b32 s1, s2 -; GISEL-NEXT: s_max_u32 s0, s0, s3 -; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GISEL-NEXT: s_cbranch_scc1 .LBB9_1 -; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: s_mov_b32 s1, s32 -; GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15) +; GISEL-NEXT: s_wait_dscnt 0x0 +; GISEL-NEXT: v_max_u32_e32 v1, v1, v2 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_readlane_b32 s2, v1, 31 +; GISEL-NEXT: s_mov_b32 exec_lo, s1 +; GISEL-NEXT: s_lshl_b32 s1, s2, 5 ; GISEL-NEXT: s_wait_storecnt 0x0 -; GISEL-NEXT: scratch_store_b32 off, v0, s1 scope:SCOPE_SYS +; GISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GISEL-NEXT: s_wait_storecnt 0x0 +; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GISEL-NEXT: s_add_co_u32 s32, s0, s1 ; GISEL-NEXT: v_mov_b32_e32 v0, s33 -; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GISEL-NEXT: s_add_co_u32 s32, s1, s0 ; GISEL-NEXT: s_mov_b32 s32, s33 -; GISEL-NEXT: s_mov_b32 s33, s34 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload +; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:4 +; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:8 +; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_mov_b32 s33, s3 +; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %local = alloca i32, i32 %val, addrspace(5) @@ -499,34 +532,36 @@ define amdgpu_cs_chain void @sponentry_cs_chain_dyn_alloc(i32 %val) #0 { ; DAGISEL-NEXT: s_wait_samplecnt 0x0 ; DAGISEL-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL-NEXT: s_wait_kmcnt 0x0 -; DAGISEL-NEXT: v_lshl_add_u32 v0, v8, 2, 15 -; DAGISEL-NEXT: s_mov_b32 s1, exec_lo -; DAGISEL-NEXT: s_mov_b32 s0, 0 +; DAGISEL-NEXT: v_lshl_add_u32 v2, v8, 2, 15 ; DAGISEL-NEXT: s_mov_b32 s33, s32 ; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16 -; DAGISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; DAGISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 -; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; DAGISEL-NEXT: s_ctz_i32_b32 s2, s1 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; DAGISEL-NEXT: v_and_b32_e32 v2, -16, v2 +; DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 ; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) +; DAGISEL-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; DAGISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; DAGISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; DAGISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; DAGISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; DAGISEL-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15) +; DAGISEL-NEXT: s_wait_dscnt 0x0 +; DAGISEL-NEXT: v_max_u32_e32 v0, v0, v1 ; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; DAGISEL-NEXT: v_readlane_b32 s3, v0, s2 -; DAGISEL-NEXT: s_bitset0_b32 s1, s2 -; DAGISEL-NEXT: s_max_u32 s0, s0, s3 +; DAGISEL-NEXT: v_readlane_b32 s1, v0, 31 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_mov_b32 s0, s32 +; DAGISEL-NEXT: v_mov_b32_e32 v3, s33 ; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; DAGISEL-NEXT: s_cmp_lg_u32 s1, 0 -; DAGISEL-NEXT: s_cbranch_scc1 .LBB12_1 -; DAGISEL-NEXT: ; %bb.2: -; DAGISEL-NEXT: s_mov_b32 s1, s32 -; DAGISEL-NEXT: v_mov_b32_e32 v1, s33 -; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; DAGISEL-NEXT: v_lshl_add_u32 v0, s0, 5, s1 +; DAGISEL-NEXT: v_lshl_add_u32 v2, s1, 5, s0 ; DAGISEL-NEXT: s_wait_storecnt 0x0 -; DAGISEL-NEXT: scratch_store_b32 off, v8, s1 scope:SCOPE_SYS +; DAGISEL-NEXT: scratch_store_b32 off, v8, s0 scope:SCOPE_SYS ; DAGISEL-NEXT: s_wait_storecnt 0x0 -; DAGISEL-NEXT: scratch_store_b32 off, v1, s1 scope:SCOPE_SYS +; DAGISEL-NEXT: scratch_store_b32 off, v3, s0 scope:SCOPE_SYS ; DAGISEL-NEXT: s_wait_storecnt 0x0 -; DAGISEL-NEXT: v_readfirstlane_b32 s32, v0 +; DAGISEL-NEXT: v_readfirstlane_b32 s32, v2 ; DAGISEL-NEXT: s_alloc_vgpr 0 ; DAGISEL-NEXT: s_endpgm ; @@ -537,34 +572,36 @@ define amdgpu_cs_chain void @sponentry_cs_chain_dyn_alloc(i32 %val) #0 { ; GISEL-NEXT: s_wait_samplecnt 0x0 ; GISEL-NEXT: s_wait_bvhcnt 0x0 ; GISEL-NEXT: s_wait_kmcnt 0x0 -; GISEL-NEXT: v_lshl_add_u32 v0, v8, 2, 15 -; GISEL-NEXT: s_mov_b32 s1, exec_lo -; GISEL-NEXT: s_mov_b32 s0, 0 +; GISEL-NEXT: v_lshl_add_u32 v2, v8, 2, 15 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_add_co_i32 s32, s32, 16 -; GISEL-NEXT: v_and_b32_e32 v0, -16, v0 -; GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GISEL-NEXT: s_ctz_i32_b32 s2, s1 +; GISEL-NEXT: s_mov_b32 s0, s32 +; GISEL-NEXT: v_and_b32_e32 v2, -16, v2 +; GISEL-NEXT: s_or_saveexec_b32 s1, -1 ; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-NEXT: v_readlane_b32 s3, v0, s2 -; GISEL-NEXT: s_bitset0_b32 s1, s2 -; GISEL-NEXT: s_max_u32 s0, s0, s3 -; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GISEL-NEXT: s_cmp_lg_u32 s1, 0 -; GISEL-NEXT: s_cbranch_scc1 .LBB12_1 -; GISEL-NEXT: ; %bb.2: -; GISEL-NEXT: v_mov_b32_e32 v0, s33 -; GISEL-NEXT: s_mov_b32 s1, s32 -; GISEL-NEXT: s_lshl_b32 s0, s0, 5 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v2, s1 +; GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf +; GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf +; GISEL-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15) +; GISEL-NEXT: s_wait_dscnt 0x0 +; GISEL-NEXT: v_max_u32_e32 v0, v0, v1 +; GISEL-NEXT: v_readlane_b32 s2, v0, 31 +; GISEL-NEXT: s_mov_b32 exec_lo, s1 +; GISEL-NEXT: v_mov_b32_e32 v2, s33 +; GISEL-NEXT: s_lshl_b32 s1, s2, 5 ; GISEL-NEXT: s_wait_storecnt 0x0 -; GISEL-NEXT: scratch_store_b32 off, v8, s1 scope:SCOPE_SYS -; GISEL-NEXT: s_wait_storecnt 0x0 -; GISEL-NEXT: scratch_store_b32 off, v0, s1 scope:SCOPE_SYS +; GISEL-NEXT: scratch_store_b32 off, v8, s0 scope:SCOPE_SYS ; GISEL-NEXT: s_wait_storecnt 0x0 ; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GISEL-NEXT: s_add_co_u32 s32, s1, s0 +; GISEL-NEXT: s_add_co_u32 s32, s0, s1 +; GISEL-NEXT: scratch_store_b32 off, v2, s0 scope:SCOPE_SYS +; GISEL-NEXT: s_wait_storecnt 0x0 ; GISEL-NEXT: s_alloc_vgpr 0 ; GISEL-NEXT: s_endpgm %local = alloca i32, i32 %val, addrspace(5)