[AMDGPU] DPP implementations for Wave Reduction (#185814)
Adding DPP reduction support for i32 types. Supported Ops: `umin`, `min`, `umax`, `max`, `add`, `sub`, `and`, `or`, `xor`.
This commit is contained in:
parent
7899b26e88
commit
fbd2467796
@ -4655,7 +4655,7 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
|
||||
SDValue WaveReduction =
|
||||
DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
|
||||
Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
|
||||
Size, DAG.getConstant(0, dl, MVT::i32));
|
||||
Size, DAG.getTargetConstant(0, dl, MVT::i32));
|
||||
SDValue ScaledSize = DAG.getNode(
|
||||
ISD::SHL, dl, VT, Size,
|
||||
DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
|
||||
@ -5639,6 +5639,32 @@ static bool isFloatingPointWaveReduceOperation(unsigned Opc) {
|
||||
Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
|
||||
}
|
||||
|
||||
static unsigned getDPPOpcForWaveReduction(unsigned Opc,
|
||||
const GCNSubtarget &ST) {
|
||||
switch (Opc) {
|
||||
case AMDGPU::S_MIN_U32:
|
||||
return AMDGPU::V_MIN_U32_dpp;
|
||||
case AMDGPU::S_MIN_I32:
|
||||
return AMDGPU::V_MIN_I32_dpp;
|
||||
case AMDGPU::S_MAX_U32:
|
||||
return AMDGPU::V_MAX_U32_dpp;
|
||||
case AMDGPU::S_MAX_I32:
|
||||
return AMDGPU::V_MAX_I32_dpp;
|
||||
case AMDGPU::S_ADD_I32:
|
||||
case AMDGPU::S_SUB_I32:
|
||||
return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
|
||||
: AMDGPU::V_ADD_CO_U32_dpp;
|
||||
case AMDGPU::S_AND_B32:
|
||||
return AMDGPU::V_AND_B32_dpp;
|
||||
case AMDGPU::S_OR_B32:
|
||||
return AMDGPU::V_OR_B32_dpp;
|
||||
case AMDGPU::S_XOR_B32:
|
||||
return AMDGPU::V_XOR_B32_dpp;
|
||||
default:
|
||||
llvm_unreachable("unhandled lane op");
|
||||
}
|
||||
}
|
||||
|
||||
static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
|
||||
MachineBasicBlock &BB,
|
||||
const GCNSubtarget &ST,
|
||||
@ -5652,6 +5678,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
|
||||
Register SrcReg = MI.getOperand(1).getReg();
|
||||
bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
|
||||
Register DstReg = MI.getOperand(0).getReg();
|
||||
unsigned Stratergy = static_cast<unsigned>(MI.getOperand(2).getImm());
|
||||
enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 };
|
||||
MachineBasicBlock *RetBB = nullptr;
|
||||
if (isSGPR) {
|
||||
switch (Opc) {
|
||||
@ -5918,267 +5946,431 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// TODO: Implement DPP Strategy and switch based on immediate strategy
|
||||
// operand. For now, for all the cases (default, Iterative and DPP we use
|
||||
// iterative approach by default.)
|
||||
|
||||
// To reduce the VGPR using iterative approach, we need to iterate
|
||||
// over all the active lanes. Lowering consists of ComputeLoop,
|
||||
// which iterate over only active lanes. We use copy of EXEC register
|
||||
// as induction variable and every active lane modifies it using bitset0
|
||||
// so that we will get the next active lane for next iteration.
|
||||
MachineBasicBlock::iterator I = BB.end();
|
||||
Register SrcReg = MI.getOperand(1).getReg();
|
||||
bool is32BitOpc = is32bitWaveReduceOperation(Opc);
|
||||
bool isFPOp = isFloatingPointWaveReduceOperation(Opc);
|
||||
|
||||
// Create Control flow for loop
|
||||
// Split MI's Machine Basic block into For loop
|
||||
auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
|
||||
|
||||
// Create virtual registers required for lowering.
|
||||
const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
|
||||
const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
|
||||
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
|
||||
Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
|
||||
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
|
||||
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
|
||||
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
|
||||
Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
|
||||
Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
|
||||
|
||||
const TargetRegisterClass *SrcRegClass = MRI.getRegClass(SrcReg);
|
||||
bool IsWave32 = ST.isWave32();
|
||||
unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
|
||||
unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
|
||||
if (Stratergy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
|
||||
!ST.hasDPP()) { // If target doesn't support DPP operations, default to
|
||||
// iterative strategy
|
||||
|
||||
// Create initial values of induction variable from Exec, Accumulator and
|
||||
// insert branch instr to newly created ComputeBlock
|
||||
BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
|
||||
if (is32BitOpc) {
|
||||
uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
|
||||
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
|
||||
.addImm(IdentityValue);
|
||||
} else {
|
||||
uint64_t IdentityValue =
|
||||
MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
|
||||
? 0x0 // +0.0 for double sub reduction
|
||||
: getIdentityValueFor64BitWaveReduction(Opc);
|
||||
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
|
||||
.addImm(IdentityValue);
|
||||
}
|
||||
// clang-format off
|
||||
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
|
||||
.addMBB(ComputeLoop);
|
||||
// clang-format on
|
||||
// To reduce the VGPR using iterative approach, we need to iterate
|
||||
// over all the active lanes. Lowering consists of ComputeLoop,
|
||||
// which iterate over only active lanes. We use copy of EXEC register
|
||||
// as induction variable and every active lane modifies it using bitset0
|
||||
// so that we will get the next active lane for next iteration.
|
||||
|
||||
// Start constructing ComputeLoop
|
||||
I = ComputeLoop->begin();
|
||||
auto Accumulator =
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
|
||||
.addReg(IdentityValReg)
|
||||
.addMBB(&BB);
|
||||
auto ActiveBits =
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
|
||||
.addReg(LoopIterator)
|
||||
.addMBB(&BB);
|
||||
// Create Control flow for loop
|
||||
// Split MI's Machine Basic block into For loop
|
||||
auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
|
||||
|
||||
I = ComputeLoop->end();
|
||||
MachineInstr *NewAccumulator;
|
||||
// Perform the computations
|
||||
unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
|
||||
.addReg(ActiveBitsReg);
|
||||
if (is32BitOpc) {
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
|
||||
LaneValueReg)
|
||||
.addReg(SrcReg)
|
||||
.addReg(FF1Reg);
|
||||
if (isFPOp) {
|
||||
Register LaneValVreg =
|
||||
MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
|
||||
Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
|
||||
// Get the Lane Value in VGPR to avoid the Constant Bus Restriction
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
|
||||
LaneValVreg)
|
||||
.addReg(LaneValueReg);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
|
||||
.addImm(0) // src0 modifier
|
||||
.addReg(Accumulator->getOperand(0).getReg())
|
||||
.addImm(0) // src1 modifier
|
||||
.addReg(LaneValVreg)
|
||||
.addImm(0) // clamp
|
||||
.addImm(0); // omod
|
||||
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
|
||||
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
|
||||
.addReg(DstVreg);
|
||||
Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
|
||||
Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
|
||||
Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
|
||||
Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
|
||||
Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
|
||||
Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
|
||||
Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
|
||||
|
||||
// Create initial values of induction variable from Exec, Accumulator and
|
||||
// insert branch instr to newly created ComputeBlock
|
||||
BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
|
||||
if (is32BitOpc) {
|
||||
uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
|
||||
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
|
||||
.addImm(IdentityValue);
|
||||
} else {
|
||||
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
|
||||
.addReg(Accumulator->getOperand(0).getReg())
|
||||
.addReg(LaneValueReg);
|
||||
}
|
||||
} else {
|
||||
Register LaneValueLoReg =
|
||||
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
||||
Register LaneValueHiReg =
|
||||
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
||||
Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
|
||||
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
|
||||
const TargetRegisterClass *SrcSubRC =
|
||||
TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
|
||||
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
|
||||
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
|
||||
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
|
||||
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
|
||||
// lane value input should be in an sgpr
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
|
||||
LaneValueLoReg)
|
||||
.add(Op1L)
|
||||
.addReg(FF1Reg);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
|
||||
LaneValueHiReg)
|
||||
.add(Op1H)
|
||||
.addReg(FF1Reg);
|
||||
auto LaneValue = BuildMI(*ComputeLoop, I, DL,
|
||||
TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
|
||||
.addReg(LaneValueLoReg)
|
||||
.addImm(AMDGPU::sub0)
|
||||
.addReg(LaneValueHiReg)
|
||||
.addImm(AMDGPU::sub1);
|
||||
switch (Opc) {
|
||||
case AMDGPU::S_OR_B64:
|
||||
case AMDGPU::S_AND_B64:
|
||||
case AMDGPU::S_XOR_B64: {
|
||||
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
|
||||
.addReg(Accumulator->getOperand(0).getReg())
|
||||
.addReg(LaneValue->getOperand(0).getReg())
|
||||
.setOperandDead(3); // Dead scc
|
||||
break;
|
||||
}
|
||||
case AMDGPU::V_CMP_GT_I64_e64:
|
||||
case AMDGPU::V_CMP_GT_U64_e64:
|
||||
case AMDGPU::V_CMP_LT_I64_e64:
|
||||
case AMDGPU::V_CMP_LT_U64_e64: {
|
||||
Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
|
||||
Register ComparisonResultReg =
|
||||
MRI.createVirtualRegister(WaveMaskRegClass);
|
||||
int SrcIdx =
|
||||
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
|
||||
const TargetRegisterClass *VregClass =
|
||||
TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
|
||||
const TargetRegisterClass *VSubRegClass =
|
||||
TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
|
||||
Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
|
||||
MachineOperand SrcReg0Sub0 =
|
||||
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
|
||||
VregClass, AMDGPU::sub0, VSubRegClass);
|
||||
MachineOperand SrcReg0Sub1 =
|
||||
TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
|
||||
VregClass, AMDGPU::sub1, VSubRegClass);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
|
||||
AccumulatorVReg)
|
||||
.add(SrcReg0Sub0)
|
||||
.addImm(AMDGPU::sub0)
|
||||
.add(SrcReg0Sub1)
|
||||
.addImm(AMDGPU::sub1);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
|
||||
.addReg(LaneValue->getOperand(0).getReg())
|
||||
.addReg(AccumulatorVReg);
|
||||
|
||||
unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
|
||||
.addReg(LaneMaskReg)
|
||||
.addReg(ActiveBitsReg);
|
||||
|
||||
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
|
||||
TII->get(AMDGPU::S_CSELECT_B64), DstReg)
|
||||
.addReg(LaneValue->getOperand(0).getReg())
|
||||
.addReg(Accumulator->getOperand(0).getReg());
|
||||
break;
|
||||
}
|
||||
case AMDGPU::V_MIN_F64_e64:
|
||||
case AMDGPU::V_MIN_NUM_F64_e64:
|
||||
case AMDGPU::V_MAX_F64_e64:
|
||||
case AMDGPU::V_MAX_NUM_F64_e64:
|
||||
case AMDGPU::V_ADD_F64_e64:
|
||||
case AMDGPU::V_ADD_F64_pseudo_e64: {
|
||||
int SrcIdx =
|
||||
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
|
||||
const TargetRegisterClass *VregRC =
|
||||
TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
|
||||
const TargetRegisterClass *VregSubRC =
|
||||
TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
|
||||
Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
|
||||
Register DstVreg = MRI.createVirtualRegister(VregRC);
|
||||
Register LaneValLo =
|
||||
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
||||
Register LaneValHi =
|
||||
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
|
||||
.addReg(Accumulator->getOperand(0).getReg());
|
||||
unsigned Modifier =
|
||||
uint64_t IdentityValue =
|
||||
MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
|
||||
? SISrcMods::NEG
|
||||
: SISrcMods::NONE;
|
||||
auto DstVregInst = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
|
||||
.addImm(Modifier) // src0 modifiers
|
||||
? 0x0 // +0.0 for double sub reduction
|
||||
: getIdentityValueFor64BitWaveReduction(Opc);
|
||||
BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO),
|
||||
IdentityValReg)
|
||||
.addImm(IdentityValue);
|
||||
}
|
||||
// clang-format off
|
||||
BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
|
||||
.addMBB(ComputeLoop);
|
||||
// clang-format on
|
||||
|
||||
// Start constructing ComputeLoop
|
||||
I = ComputeLoop->begin();
|
||||
auto Accumulator =
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
|
||||
.addReg(IdentityValReg)
|
||||
.addMBB(&BB);
|
||||
auto ActiveBits =
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
|
||||
.addReg(LoopIterator)
|
||||
.addMBB(&BB);
|
||||
|
||||
I = ComputeLoop->end();
|
||||
MachineInstr *NewAccumulator;
|
||||
// Perform the computations
|
||||
unsigned SFFOpc =
|
||||
IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
|
||||
.addReg(ActiveBitsReg);
|
||||
if (is32BitOpc) {
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
|
||||
LaneValueReg)
|
||||
.addReg(SrcReg)
|
||||
.addReg(FF1Reg);
|
||||
if (isFPOp) {
|
||||
Register LaneValVreg =
|
||||
MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
|
||||
Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
|
||||
// Get the Lane Value in VGPR to avoid the Constant Bus Restriction
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
|
||||
LaneValVreg)
|
||||
.addReg(LaneValueReg);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
|
||||
.addImm(0) // src0 modifier
|
||||
.addReg(Accumulator->getOperand(0).getReg())
|
||||
.addImm(0) // src1 modifier
|
||||
.addReg(LaneValVreg)
|
||||
.addImm(0) // clamp
|
||||
.addImm(0); // omod
|
||||
NewAccumulator =
|
||||
BuildMI(*ComputeLoop, I, DL,
|
||||
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
|
||||
.addReg(DstVreg);
|
||||
} else {
|
||||
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
|
||||
.addReg(Accumulator->getOperand(0).getReg())
|
||||
.addReg(LaneValueReg);
|
||||
}
|
||||
} else {
|
||||
Register LaneValueLoReg =
|
||||
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
||||
Register LaneValueHiReg =
|
||||
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
||||
Register LaneValReg =
|
||||
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
|
||||
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
|
||||
const TargetRegisterClass *SrcSubRC =
|
||||
TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
|
||||
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
|
||||
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
|
||||
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
|
||||
MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
|
||||
// lane value input should be in an sgpr
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
|
||||
LaneValueLoReg)
|
||||
.add(Op1L)
|
||||
.addReg(FF1Reg);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
|
||||
LaneValueHiReg)
|
||||
.add(Op1H)
|
||||
.addReg(FF1Reg);
|
||||
auto LaneValue =
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
|
||||
LaneValReg)
|
||||
.addReg(LaneValueLoReg)
|
||||
.addImm(AMDGPU::sub0)
|
||||
.addReg(LaneValueHiReg)
|
||||
.addImm(AMDGPU::sub1);
|
||||
switch (Opc) {
|
||||
case AMDGPU::S_OR_B64:
|
||||
case AMDGPU::S_AND_B64:
|
||||
case AMDGPU::S_XOR_B64: {
|
||||
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
|
||||
.addReg(Accumulator->getOperand(0).getReg())
|
||||
.addReg(LaneValue->getOperand(0).getReg())
|
||||
.addImm(SISrcMods::NONE) // src1 modifiers
|
||||
.addReg(AccumulatorVReg)
|
||||
.addImm(SISrcMods::NONE) // clamp
|
||||
.addImm(SISrcMods::NONE); // omod
|
||||
auto ReadLaneLo =
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
|
||||
LaneValLo);
|
||||
auto ReadLaneHi =
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
|
||||
LaneValHi);
|
||||
MachineBasicBlock::iterator Iters = *ReadLaneLo;
|
||||
MachineOperand Op1L =
|
||||
TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
|
||||
VregRC, AMDGPU::sub0, VregSubRC);
|
||||
MachineOperand Op1H =
|
||||
TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
|
||||
VregRC, AMDGPU::sub1, VregSubRC);
|
||||
ReadLaneLo.add(Op1L);
|
||||
ReadLaneHi.add(Op1H);
|
||||
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
|
||||
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
|
||||
.addReg(LaneValLo)
|
||||
.addImm(AMDGPU::sub0)
|
||||
.addReg(LaneValHi)
|
||||
.addImm(AMDGPU::sub1);
|
||||
break;
|
||||
.setOperandDead(3); // Dead scc
|
||||
break;
|
||||
}
|
||||
case AMDGPU::V_CMP_GT_I64_e64:
|
||||
case AMDGPU::V_CMP_GT_U64_e64:
|
||||
case AMDGPU::V_CMP_LT_I64_e64:
|
||||
case AMDGPU::V_CMP_LT_U64_e64: {
|
||||
Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
|
||||
Register ComparisonResultReg =
|
||||
MRI.createVirtualRegister(WaveMaskRegClass);
|
||||
int SrcIdx =
|
||||
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
|
||||
const TargetRegisterClass *VregClass =
|
||||
TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
|
||||
const TargetRegisterClass *VSubRegClass =
|
||||
TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
|
||||
Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
|
||||
MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
|
||||
MI, MRI, Accumulator->getOperand(0), VregClass, AMDGPU::sub0,
|
||||
VSubRegClass);
|
||||
MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
|
||||
MI, MRI, Accumulator->getOperand(0), VregClass, AMDGPU::sub1,
|
||||
VSubRegClass);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
|
||||
AccumulatorVReg)
|
||||
.add(SrcReg0Sub0)
|
||||
.addImm(AMDGPU::sub0)
|
||||
.add(SrcReg0Sub1)
|
||||
.addImm(AMDGPU::sub1);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
|
||||
.addReg(LaneValue->getOperand(0).getReg())
|
||||
.addReg(AccumulatorVReg);
|
||||
|
||||
unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
|
||||
.addReg(LaneMaskReg)
|
||||
.addReg(ActiveBitsReg);
|
||||
|
||||
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
|
||||
TII->get(AMDGPU::S_CSELECT_B64), DstReg)
|
||||
.addReg(LaneValue->getOperand(0).getReg())
|
||||
.addReg(Accumulator->getOperand(0).getReg());
|
||||
break;
|
||||
}
|
||||
case AMDGPU::V_MIN_F64_e64:
|
||||
case AMDGPU::V_MIN_NUM_F64_e64:
|
||||
case AMDGPU::V_MAX_F64_e64:
|
||||
case AMDGPU::V_MAX_NUM_F64_e64:
|
||||
case AMDGPU::V_ADD_F64_e64:
|
||||
case AMDGPU::V_ADD_F64_pseudo_e64: {
|
||||
int SrcIdx =
|
||||
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
|
||||
const TargetRegisterClass *VregRC =
|
||||
TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
|
||||
const TargetRegisterClass *VregSubRC =
|
||||
TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
|
||||
Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
|
||||
Register DstVreg = MRI.createVirtualRegister(VregRC);
|
||||
Register LaneValLo =
|
||||
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
||||
Register LaneValHi =
|
||||
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
|
||||
.addReg(Accumulator->getOperand(0).getReg());
|
||||
unsigned Modifier =
|
||||
MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
|
||||
? SISrcMods::NEG
|
||||
: SISrcMods::NONE;
|
||||
auto DstVregInst =
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
|
||||
.addImm(Modifier) // src0 modifiers
|
||||
.addReg(LaneValue->getOperand(0).getReg())
|
||||
.addImm(SISrcMods::NONE) // src1 modifiers
|
||||
.addReg(AccumulatorVReg)
|
||||
.addImm(SISrcMods::NONE) // clamp
|
||||
.addImm(SISrcMods::NONE); // omod
|
||||
auto ReadLaneLo =
|
||||
BuildMI(*ComputeLoop, I, DL,
|
||||
TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
|
||||
auto ReadLaneHi =
|
||||
BuildMI(*ComputeLoop, I, DL,
|
||||
TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
|
||||
MachineBasicBlock::iterator Iters = *ReadLaneLo;
|
||||
MachineOperand Op1L = TII->buildExtractSubRegOrImm(
|
||||
Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub0,
|
||||
VregSubRC);
|
||||
MachineOperand Op1H = TII->buildExtractSubRegOrImm(
|
||||
Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub1,
|
||||
VregSubRC);
|
||||
ReadLaneLo.add(Op1L);
|
||||
ReadLaneHi.add(Op1H);
|
||||
NewAccumulator = BuildMI(*ComputeLoop, I, DL,
|
||||
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
|
||||
.addReg(LaneValLo)
|
||||
.addImm(AMDGPU::sub0)
|
||||
.addReg(LaneValHi)
|
||||
.addImm(AMDGPU::sub1);
|
||||
break;
|
||||
}
|
||||
case AMDGPU::S_ADD_U64_PSEUDO:
|
||||
case AMDGPU::S_SUB_U64_PSEUDO: {
|
||||
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
|
||||
.addReg(Accumulator->getOperand(0).getReg())
|
||||
.addReg(LaneValue->getOperand(0).getReg());
|
||||
ComputeLoop =
|
||||
Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
case AMDGPU::S_ADD_U64_PSEUDO:
|
||||
case AMDGPU::S_SUB_U64_PSEUDO: {
|
||||
NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
|
||||
.addReg(Accumulator->getOperand(0).getReg())
|
||||
.addReg(LaneValue->getOperand(0).getReg());
|
||||
ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
|
||||
break;
|
||||
// Manipulate the iterator to get the next active lane
|
||||
unsigned BITSETOpc =
|
||||
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
|
||||
.addReg(FF1Reg)
|
||||
.addReg(ActiveBitsReg);
|
||||
|
||||
// Add phi nodes
|
||||
Accumulator.addReg(DstReg).addMBB(ComputeLoop);
|
||||
ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
|
||||
|
||||
// Creating branching
|
||||
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
|
||||
.addReg(NewActiveBitsReg)
|
||||
.addImm(0);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
|
||||
.addMBB(ComputeLoop);
|
||||
|
||||
RetBB = ComputeEnd;
|
||||
} else {
|
||||
assert(ST.hasDPP() && "Sub Target does not support DPP Operations");
|
||||
|
||||
Register SrcWithIdentity = MRI.createVirtualRegister(SrcRegClass);
|
||||
Register IdentityVGPR = MRI.createVirtualRegister(SrcRegClass);
|
||||
Register IdentitySGPR = MRI.createVirtualRegister(DstRegClass);
|
||||
Register DPPRowShr1 = MRI.createVirtualRegister(SrcRegClass);
|
||||
Register DPPRowShr2 = MRI.createVirtualRegister(SrcRegClass);
|
||||
Register DPPRowShr4 = MRI.createVirtualRegister(SrcRegClass);
|
||||
Register DPPRowShr8 = MRI.createVirtualRegister(SrcRegClass);
|
||||
Register RowBcast15 = MRI.createVirtualRegister(SrcRegClass);
|
||||
Register ReducedValSGPR = MRI.createVirtualRegister(DstRegClass);
|
||||
Register NegatedReducedVal = MRI.createVirtualRegister(DstRegClass);
|
||||
Register RowBcast31 = MRI.createVirtualRegister(SrcRegClass);
|
||||
Register UndefExec = MRI.createVirtualRegister(WaveMaskRegClass);
|
||||
Register FinalDPPResult;
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
|
||||
|
||||
uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), IdentitySGPR)
|
||||
.addImm(IdentityValue);
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
|
||||
.addReg(IdentitySGPR);
|
||||
|
||||
// Set inactive lanes to the identity value.
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_SET_INACTIVE_B32), SrcWithIdentity)
|
||||
.addImm(0) // src0 modifiers
|
||||
.addReg(SrcReg) // src0
|
||||
.addImm(0) // src1 modifiers
|
||||
.addReg(IdentityVGPR) // identity value for inactive lanes
|
||||
.addReg(UndefExec); // bool i1
|
||||
|
||||
unsigned DPPOpc = getDPPOpcForWaveReduction(Opc, ST);
|
||||
auto BuildDPPMachineInstr = [&](Register Dst, Register Src,
|
||||
unsigned DPPCtrl) {
|
||||
BuildMI(BB, MI, DL, TII->get(DPPOpc), Dst)
|
||||
.addReg(Src) // old
|
||||
.addReg(Src) // src0
|
||||
.addReg(Src) // src1
|
||||
.addImm(DPPCtrl) // dpp-ctrl
|
||||
.addImm(0xf) // row-mask
|
||||
.addImm(0xf) // bank-mask
|
||||
.addImm(0); // bound-control
|
||||
};
|
||||
// DPP reduction
|
||||
BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentity,
|
||||
AMDGPU::DPP::ROW_SHR_FIRST);
|
||||
|
||||
BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
|
||||
(AMDGPU::DPP::ROW_SHR_FIRST + 1));
|
||||
|
||||
BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
|
||||
(AMDGPU::DPP::ROW_SHR_FIRST + 3));
|
||||
|
||||
BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
|
||||
(AMDGPU::DPP::ROW_SHR_FIRST + 7));
|
||||
|
||||
if (ST.hasDPPBroadcasts()) {
|
||||
BuildDPPMachineInstr(RowBcast15, DPPRowShr8, AMDGPU::DPP::BCAST15);
|
||||
} else {
|
||||
// magic constant: 0x1E0
|
||||
// To Set BIT_MODE : bit 15 = 0
|
||||
// XOR mask : bit [14:10] = 0
|
||||
// OR mask : bit [9:5] = 15
|
||||
// AND mask : bit [4:0] = 0
|
||||
Register SwizzledValue =
|
||||
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32), SwizzledValue)
|
||||
.addReg(DPPRowShr8) // addr
|
||||
.addImm(0x1E0) // swizzle offset (i16)
|
||||
.addImm(0x0); // gds (i1)
|
||||
auto ClampInstr =
|
||||
BuildMI(BB, MI, DL,
|
||||
TII->get(TII->getVALUOp(
|
||||
Opc == AMDGPU::S_SUB_I32
|
||||
? static_cast<unsigned>(AMDGPU::S_ADD_I32)
|
||||
: Opc)),
|
||||
RowBcast15)
|
||||
.addReg(DPPRowShr8)
|
||||
.addReg(SwizzledValue);
|
||||
if (TII->hasIntClamp(*ClampInstr) || TII->hasFPClamp(*ClampInstr))
|
||||
ClampInstr.addImm(0);
|
||||
}
|
||||
FinalDPPResult = RowBcast15;
|
||||
if (!IsWave32) {
|
||||
if (ST.hasDPPBroadcasts()) {
|
||||
BuildDPPMachineInstr(RowBcast31, RowBcast15, AMDGPU::DPP::BCAST31);
|
||||
} else {
|
||||
Register ShiftedThreadID =
|
||||
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
||||
Register PermuteByteOffset =
|
||||
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
||||
Register PermutedValue =
|
||||
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
||||
Register Lane32Offset = MRI.createVirtualRegister(DstRegClass);
|
||||
Register WordSizeConst = MRI.createVirtualRegister(DstRegClass);
|
||||
Register ThreadIDRegLo =
|
||||
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
||||
Register ThreadIDReg =
|
||||
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
||||
// Get the thread ID.
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
|
||||
ThreadIDRegLo)
|
||||
.addImm(-1)
|
||||
.addImm(0);
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
|
||||
ThreadIDReg)
|
||||
.addImm(-1)
|
||||
.addReg(ThreadIDRegLo);
|
||||
// shift each lane over by 32 positions, so value in 31st lane is
|
||||
// present in 63rd lane.
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
|
||||
.addImm(0x20);
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), ShiftedThreadID)
|
||||
.addReg(ThreadIDReg)
|
||||
.addReg(Lane32Offset)
|
||||
.addImm(0); // clamp
|
||||
// multiply by reg size.
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
|
||||
.addImm(0x4);
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64),
|
||||
PermuteByteOffset)
|
||||
.addReg(WordSizeConst)
|
||||
.addReg(ShiftedThreadID);
|
||||
// Permute the lanes
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32), PermutedValue)
|
||||
.addReg(PermuteByteOffset) // addr
|
||||
.addReg(RowBcast15) // data
|
||||
.addImm(0); // offset
|
||||
auto ClampInstr =
|
||||
BuildMI(BB, MI, DL,
|
||||
TII->get(TII->getVALUOp(
|
||||
Opc == AMDGPU::S_SUB_I32
|
||||
? static_cast<unsigned>(AMDGPU::S_ADD_I32)
|
||||
: Opc)),
|
||||
RowBcast31)
|
||||
.addReg(RowBcast15)
|
||||
.addReg(PermutedValue);
|
||||
if (TII->hasIntClamp(*ClampInstr) || TII->hasFPClamp(*ClampInstr))
|
||||
ClampInstr.addImm(0);
|
||||
}
|
||||
FinalDPPResult = RowBcast31;
|
||||
}
|
||||
// The final reduced value is in the last lane.
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), ReducedValSGPR)
|
||||
.addReg(FinalDPPResult)
|
||||
.addImm(ST.getWavefrontSize() - 1);
|
||||
if (Opc == AMDGPU::S_SUB_I32)
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
|
||||
.addImm(0)
|
||||
.addReg(ReducedValSGPR);
|
||||
// Mark the final result as a whole-wave-mode calculation.
|
||||
BuildMI(BB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
|
||||
.addReg(Opc == AMDGPU::S_SUB_I32 ? NegatedReducedVal
|
||||
: ReducedValSGPR);
|
||||
RetBB = &BB;
|
||||
}
|
||||
// Manipulate the iterator to get the next active lane
|
||||
unsigned BITSETOpc =
|
||||
IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
|
||||
.addReg(FF1Reg)
|
||||
.addReg(ActiveBitsReg);
|
||||
|
||||
// Add phi nodes
|
||||
Accumulator.addReg(DstReg).addMBB(ComputeLoop);
|
||||
ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
|
||||
|
||||
// Creating branching
|
||||
unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
|
||||
.addReg(NewActiveBitsReg)
|
||||
.addImm(0);
|
||||
BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
|
||||
.addMBB(ComputeLoop);
|
||||
|
||||
RetBB = ComputeEnd;
|
||||
}
|
||||
MI.eraseFromParent();
|
||||
return RetBB;
|
||||
|
||||
@ -5948,11 +5948,21 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
|
||||
return true;
|
||||
}
|
||||
|
||||
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
|
||||
if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
|
||||
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
|
||||
return MI.getOperand(1).isReg() || RI.isAGPR(MRI, MI.getOperand(0).getReg())
|
||||
? AMDGPU::COPY
|
||||
: AMDGPU::V_MOV_B32_e32;
|
||||
}
|
||||
return getVALUOp(MI.getOpcode());
|
||||
}
|
||||
|
||||
// It is more readable to list mapped opcodes on the same line.
|
||||
// clang-format off
|
||||
|
||||
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
|
||||
switch (MI.getOpcode()) {
|
||||
unsigned SIInstrInfo::getVALUOp(unsigned Opc) const {
|
||||
switch (Opc) {
|
||||
default: return AMDGPU::INSTRUCTION_LIST_END;
|
||||
case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
|
||||
case AMDGPU::COPY: return AMDGPU::COPY;
|
||||
@ -5962,12 +5972,6 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
|
||||
case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
|
||||
case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
|
||||
case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
|
||||
case AMDGPU::S_MOV_B32: {
|
||||
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
|
||||
return MI.getOperand(1).isReg() ||
|
||||
RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
|
||||
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
|
||||
}
|
||||
case AMDGPU::S_ADD_I32:
|
||||
return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
|
||||
case AMDGPU::S_ADDC_U32:
|
||||
|
||||
@ -1361,6 +1361,7 @@ public:
|
||||
StringRef &ErrInfo) const override;
|
||||
|
||||
/// Instruction-based lookup: inspects \p MI's operands for S_MOV_B32 and
/// otherwise defers to the opcode-only overload below.
unsigned getVALUOp(const MachineInstr &MI) const;
|
||||
/// Opcode-based lookup for callers that have only an opcode and no
/// MachineInstr. Returns AMDGPU::INSTRUCTION_LIST_END for opcodes it does
/// not map.
unsigned getVALUOp(unsigned Opc) const;
|
||||
|
||||
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator MBBI,
|
||||
|
||||
@ -354,7 +354,7 @@ multiclass
|
||||
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, UseNamedOperandTable = 1, Uses = [EXEC] in {
|
||||
def !toupper(Op) #"_PSEUDO_" #DataType
|
||||
: VPseudoInstSI<(outs RetReg : $sdst),
|
||||
(ins Reg : $src, VSrc_b32 : $strategy),
|
||||
(ins Reg : $src, i32imm : $strategy),
|
||||
[(set ty : $sdst, (!cast<AMDGPUWaveReduce>("int_amdgcn_wave_reduce_" #Op) ty : $src, i32 : $strategy))]> {}
|
||||
}
|
||||
}
|
||||
|
||||
@ -133,9 +133,9 @@ define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
|
||||
; GFX942-NEXT: s_mov_b32 s33, s32
|
||||
; GFX942-NEXT: s_add_i32 s32, s32, 16
|
||||
; GFX942-NEXT: s_and_b32 s0, s0, -16
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_lshl_b32 s0, s0, 6
|
||||
; GFX942-NEXT: s_mov_b32 s1, s32
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_add_i32 s32, s1, s0
|
||||
; GFX942-NEXT: scratch_store_dword off, v0, s1
|
||||
; GFX942-NEXT: s_endpgm
|
||||
@ -152,54 +152,74 @@ define amdgpu_cs_chain void @test_alloca_var(i32 %count) {
|
||||
; GFX12-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15
|
||||
; GFX12-NEXT: s_mov_b64 s[0:1], exec
|
||||
; GFX12-NEXT: s_mov_b32 s2, 0
|
||||
; GFX12-NEXT: v_lshl_add_u32 v3, v8, 2, 15
|
||||
; GFX12-NEXT: s_mov_b32 s33, s32
|
||||
; GFX12-NEXT: s_add_co_i32 s32, s32, 16
|
||||
; GFX12-NEXT: v_and_b32_e32 v1, -16, v0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_and_b32_e32 v3, -16, v3
|
||||
; GFX12-NEXT: s_or_saveexec_b64 s[0:1], -1
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_readlane_b32 s4, v1, s3
|
||||
; GFX12-NEXT: s_bitset0_b64 s[0:1], s3
|
||||
; GFX12-NEXT: s_max_u32 s2, s2, s4
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX12-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX12-NEXT: ; %bb.2:
|
||||
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v3, s[0:1]
|
||||
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v2
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: v_add_nc_u32_e32 v2, 32, v2
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: v_mul_lo_u32 v2, 4, v2
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
|
||||
; GFX12-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
|
||||
; GFX12-NEXT: ds_permute_b32 v1, v2, v0
|
||||
; GFX12-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_readlane_b32 s2, v0, 63
|
||||
; GFX12-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX12-NEXT: s_mov_b32 s0, s32
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_lshl_add_u32 v1, s2, 6, s0
|
||||
; GFX12-NEXT: scratch_store_b32 off, v0, s0
|
||||
; GFX12-NEXT: v_readfirstlane_b32 s32, v1
|
||||
; GFX12-NEXT: v_lshl_add_u32 v3, s2, 6, s0
|
||||
; GFX12-NEXT: scratch_store_b32 off, v4, s0
|
||||
; GFX12-NEXT: v_readfirstlane_b32 s32, v3
|
||||
; GFX12-NEXT: s_endpgm
|
||||
;
|
||||
; GFX942-LABEL: test_alloca_var:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15
|
||||
; GFX942-NEXT: v_and_b32_e32 v1, -16, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], exec
|
||||
; GFX942-NEXT: s_mov_b32 s2, 0
|
||||
; GFX942-NEXT: v_lshl_add_u32 v1, v8, 2, 15
|
||||
; GFX942-NEXT: s_mov_b32 s33, s32
|
||||
; GFX942-NEXT: s_add_i32 s32, s32, 16
|
||||
; GFX942-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX942-NEXT: v_readlane_b32 s4, v1, s3
|
||||
; GFX942-NEXT: s_bitset0_b64 s[0:1], s3
|
||||
; GFX942-NEXT: s_max_u32 s2, s2, s4
|
||||
; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX942-NEXT: s_cbranch_scc1 .LBB3_1
|
||||
; GFX942-NEXT: ; %bb.2:
|
||||
; GFX942-NEXT: v_and_b32_e32 v1, -16, v1
|
||||
; GFX942-NEXT: s_or_saveexec_b64 s[0:1], -1
|
||||
; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[0:1]
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_readlane_b32 s2, v0, 63
|
||||
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX942-NEXT: s_mov_b32 s0, s32
|
||||
; GFX942-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1
|
||||
; GFX942-NEXT: scratch_store_dword off, v0, s0
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_readfirstlane_b32 s32, v1
|
||||
; GFX942-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX942-NEXT: scratch_store_dword off, v1, s0
|
||||
; GFX942-NEXT: s_endpgm
|
||||
%v = alloca i32, i32 %count, align 4, addrspace(5)
|
||||
store i32 0, ptr addrspace(5) %v, align 4
|
||||
@ -302,8 +322,8 @@ define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count)
|
||||
; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
|
||||
; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_mov_b32 s3, s32
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_add_i32 s32, s3, s2
|
||||
; GFX942-NEXT: scratch_store_dword off, v0, s3
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -323,35 +343,50 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) {
|
||||
; GFX12-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15
|
||||
; GFX12-NEXT: s_mov_b64 s[0:1], exec
|
||||
; GFX12-NEXT: s_mov_b32 s2, 0
|
||||
; GFX12-NEXT: v_lshl_add_u32 v3, v8, 2, 15
|
||||
; GFX12-NEXT: s_mov_b32 s33, s32
|
||||
; GFX12-NEXT: s_add_co_i32 s32, s32, 16
|
||||
; GFX12-NEXT: v_and_b32_e32 v1, -16, v0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_and_b32_e32 v3, -16, v3
|
||||
; GFX12-NEXT: s_or_saveexec_b64 s[0:1], -1
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v3, s[0:1]
|
||||
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
|
||||
; GFX12-NEXT: s_getpc_b64 s[2:3]
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_readlane_b32 s4, v1, s3
|
||||
; GFX12-NEXT: s_bitset0_b64 s[0:1], s3
|
||||
; GFX12-NEXT: s_max_u32 s2, s2, s4
|
||||
; GFX12-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX12-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX12-NEXT: ; %bb.2:
|
||||
; GFX12-NEXT: s_getpc_b64 s[0:1]
|
||||
; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v2
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: v_add_nc_u32_e32 v2, 32, v2
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: v_mul_lo_u32 v2, 4, v2
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
|
||||
; GFX12-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
|
||||
; GFX12-NEXT: ds_permute_b32 v1, v2, v0
|
||||
; GFX12-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
|
||||
; GFX12-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
|
||||
; GFX12-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_readlane_b32 s4, v0, 63
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12
|
||||
; GFX12-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX12-NEXT: s_mov_b32 s2, s32
|
||||
; GFX12-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24
|
||||
; GFX12-NEXT: s_mov_b32 s3, s32
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
|
||||
; GFX12-NEXT: v_lshl_add_u32 v1, s2, 6, s3
|
||||
; GFX12-NEXT: scratch_store_b32 off, v0, s3
|
||||
; GFX12-NEXT: v_readfirstlane_b32 s32, v1
|
||||
; GFX12-NEXT: v_lshl_add_u32 v3, s4, 6, s2
|
||||
; GFX12-NEXT: scratch_store_b32 off, v4, s2
|
||||
; GFX12-NEXT: v_readfirstlane_b32 s32, v3
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
|
||||
; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
@ -360,21 +395,27 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) {
|
||||
; GFX942-LABEL: test_alloca_and_call_var:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15
|
||||
; GFX942-NEXT: v_and_b32_e32 v1, -16, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], exec
|
||||
; GFX942-NEXT: s_mov_b32 s2, 0
|
||||
; GFX942-NEXT: v_lshl_add_u32 v1, v8, 2, 15
|
||||
; GFX942-NEXT: s_mov_b32 s33, s32
|
||||
; GFX942-NEXT: s_add_i32 s32, s32, 16
|
||||
; GFX942-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX942-NEXT: v_readlane_b32 s4, v1, s3
|
||||
; GFX942-NEXT: s_bitset0_b64 s[0:1], s3
|
||||
; GFX942-NEXT: s_max_u32 s2, s2, s4
|
||||
; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX942-NEXT: s_cbranch_scc1 .LBB6_1
|
||||
; GFX942-NEXT: ; %bb.2:
|
||||
; GFX942-NEXT: v_and_b32_e32 v1, -16, v1
|
||||
; GFX942-NEXT: s_or_saveexec_b64 s[0:1], -1
|
||||
; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[0:1]
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_readlane_b32 s2, v0, 63
|
||||
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX942-NEXT: s_getpc_b64 s[0:1]
|
||||
; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
|
||||
; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
|
||||
@ -382,8 +423,10 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) {
|
||||
; GFX942-NEXT: s_mov_b32 s3, s32
|
||||
; GFX942-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1
|
||||
; GFX942-NEXT: scratch_store_dword off, v0, s3
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_readfirstlane_b32 s32, v1
|
||||
; GFX942-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX942-NEXT: scratch_store_dword off, v1, s3
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX942-NEXT: s_endpgm
|
||||
@ -467,13 +510,13 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
|
||||
; GFX12-NEXT: s_mov_b32 s4, s32
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_lshl_b32 s0, s0, 6
|
||||
; GFX12-NEXT: v_mov_b32_e32 v40, 0
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_add_co_i32 s32, s4, s0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3]
|
||||
; GFX12-NEXT: scratch_store_b32 off, v40, s4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: scratch_store_b32 off, v0, s4
|
||||
; GFX12-NEXT: s_endpgm
|
||||
;
|
||||
; GFX942-LABEL: test_call_and_alloca_var_uniform:
|
||||
@ -489,11 +532,11 @@ define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count)
|
||||
; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
|
||||
; GFX942-NEXT: s_mov_b32 s4, s32
|
||||
; GFX942-NEXT: v_mov_b32_e32 v40, 0
|
||||
; GFX942-NEXT: s_add_i32 s32, s4, s2
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX942-NEXT: scratch_store_dword off, v40, s4
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NEXT: scratch_store_dword off, v0, s4
|
||||
; GFX942-NEXT: s_endpgm
|
||||
%v = alloca i32, i32 %count, align 4, addrspace(5)
|
||||
call amdgpu_gfx void @foo()
|
||||
@ -509,71 +552,93 @@ define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) {
|
||||
; GFX12-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15
|
||||
; GFX12-NEXT: v_mov_b32_e32 v40, 0
|
||||
; GFX12-NEXT: s_mov_b64 s[0:1], exec
|
||||
; GFX12-NEXT: s_mov_b32 s2, 0
|
||||
; GFX12-NEXT: v_lshl_add_u32 v3, v8, 2, 15
|
||||
; GFX12-NEXT: s_mov_b32 s33, s32
|
||||
; GFX12-NEXT: v_and_b32_e32 v0, -16, v0
|
||||
; GFX12-NEXT: s_add_co_i32 s32, s32, 16
|
||||
; GFX12-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_and_b32_e32 v3, -16, v3
|
||||
; GFX12-NEXT: s_or_saveexec_b64 s[0:1], -1
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_ctz_i32_b64 s3, s[0:1]
|
||||
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, v3, s[0:1]
|
||||
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
|
||||
; GFX12-NEXT: s_getpc_b64 s[2:3]
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_readlane_b32 s4, v0, s3
|
||||
; GFX12-NEXT: s_bitset0_b64 s[0:1], s3
|
||||
; GFX12-NEXT: s_max_u32 s2, s2, s4
|
||||
; GFX12-NEXT: s_sext_i32_i16 s3, s3
|
||||
; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX12-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX12-NEXT: ; %bb.2:
|
||||
; GFX12-NEXT: s_getpc_b64 s[0:1]
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_sext_i32_i16 s1, s1
|
||||
; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24
|
||||
; GFX12-NEXT: s_mov_b32 s4, s32
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
|
||||
; GFX12-NEXT: v_lshl_add_u32 v0, s2, 6, s4
|
||||
; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: v_mbcnt_hi_u32_b32 v2, -1, v2
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: v_add_nc_u32_e32 v2, 32, v2
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: v_mul_lo_u32 v2, 4, v2
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
|
||||
; GFX12-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
|
||||
; GFX12-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
|
||||
; GFX12-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
|
||||
; GFX12-NEXT: ds_permute_b32 v1, v2, v0
|
||||
; GFX12-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-NEXT: v_max_u32_e32 v0, v0, v1
|
||||
; GFX12-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
|
||||
; GFX12-NEXT: s_or_saveexec_b64 s[2:3], -1
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_readfirstlane_b32 s32, v0
|
||||
; GFX12-NEXT: v_readlane_b32 s4, v0, 63
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX12-NEXT: s_mov_b32 s5, s32
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_lshl_add_u32 v3, s4, 6, s5
|
||||
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX12-NEXT: v_readfirstlane_b32 s32, v3
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
|
||||
; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX12-NEXT: scratch_store_b32 off, v40, s4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX12-NEXT: scratch_store_b32 off, v3, s5
|
||||
; GFX12-NEXT: s_endpgm
|
||||
;
|
||||
; GFX942-LABEL: test_call_and_alloca_var:
|
||||
; GFX942: ; %bb.0:
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15
|
||||
; GFX942-NEXT: v_and_b32_e32 v0, -16, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v40, 0
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], exec
|
||||
; GFX942-NEXT: s_mov_b32 s2, 0
|
||||
; GFX942-NEXT: v_lshl_add_u32 v1, v8, 2, 15
|
||||
; GFX942-NEXT: s_mov_b32 s33, s32
|
||||
; GFX942-NEXT: s_add_i32 s32, s32, 16
|
||||
; GFX942-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
|
||||
; GFX942-NEXT: v_readlane_b32 s4, v0, s3
|
||||
; GFX942-NEXT: s_bitset0_b64 s[0:1], s3
|
||||
; GFX942-NEXT: s_max_u32 s2, s2, s4
|
||||
; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
|
||||
; GFX942-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GFX942-NEXT: ; %bb.2:
|
||||
; GFX942-NEXT: v_and_b32_e32 v1, -16, v1
|
||||
; GFX942-NEXT: s_or_saveexec_b64 s[0:1], -1
|
||||
; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[0:1]
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0xf bank_mask:0xf
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_readlane_b32 s2, v0, 63
|
||||
; GFX942-NEXT: s_mov_b64 exec, s[0:1]
|
||||
; GFX942-NEXT: s_getpc_b64 s[0:1]
|
||||
; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4
|
||||
; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
|
||||
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
|
||||
; GFX942-NEXT: s_mov_b32 s4, s32
|
||||
; GFX942-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX942-NEXT: v_lshl_add_u32 v0, s2, 6, v0
|
||||
; GFX942-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1
|
||||
; GFX942-NEXT: s_nop 0
|
||||
; GFX942-NEXT: v_readfirstlane_b32 s32, v0
|
||||
; GFX942-NEXT: v_readfirstlane_b32 s32, v1
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX942-NEXT: scratch_store_dword off, v40, s4
|
||||
; GFX942-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX942-NEXT: scratch_store_dword off, v1, s4
|
||||
; GFX942-NEXT: s_endpgm
|
||||
%v = alloca i32, i32 %count, align 4, addrspace(5)
|
||||
call amdgpu_gfx void @foo()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -317,34 +317,50 @@ define amdgpu_gfx ptr addrspace(5) @sponentry_gfx_dyn_alloc(i32 %val) #0 {
|
||||
; DAGISEL-NEXT: s_wait_samplecnt 0x0
|
||||
; DAGISEL-NEXT: s_wait_bvhcnt 0x0
|
||||
; DAGISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; DAGISEL-NEXT: v_lshl_add_u32 v1, v0, 2, 15
|
||||
; DAGISEL-NEXT: s_mov_b32 s34, s33
|
||||
; DAGISEL-NEXT: s_mov_b32 s1, exec_lo
|
||||
; DAGISEL-NEXT: s_mov_b32 s0, 0
|
||||
; DAGISEL-NEXT: s_mov_b32 s2, s33
|
||||
; DAGISEL-NEXT: s_mov_b32 s33, s32
|
||||
; DAGISEL-NEXT: v_and_b32_e32 v1, -16, v1
|
||||
; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
|
||||
; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill
|
||||
; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:4
|
||||
; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:8
|
||||
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
|
||||
; DAGISEL-NEXT: v_lshl_add_u32 v3, v0, 2, 15
|
||||
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
|
||||
; DAGISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; DAGISEL-NEXT: v_and_b32_e32 v3, -16, v3
|
||||
; DAGISEL-NEXT: s_or_saveexec_b32 s0, -1
|
||||
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; DAGISEL-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s0
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
|
||||
; DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
|
||||
; DAGISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
|
||||
; DAGISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
|
||||
; DAGISEL-NEXT: s_wait_dscnt 0x0
|
||||
; DAGISEL-NEXT: v_max_u32_e32 v1, v1, v2
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; DAGISEL-NEXT: v_readlane_b32 s1, v1, 31
|
||||
; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
|
||||
; DAGISEL-NEXT: s_mov_b32 s0, s32
|
||||
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; DAGISEL-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; DAGISEL-NEXT: s_bitset0_b32 s1, s2
|
||||
; DAGISEL-NEXT: s_max_u32 s0, s0, s3
|
||||
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; DAGISEL-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; DAGISEL-NEXT: ; %bb.2:
|
||||
; DAGISEL-NEXT: s_mov_b32 s1, s32
|
||||
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; DAGISEL-NEXT: v_lshl_add_u32 v1, s0, 5, s1
|
||||
; DAGISEL-NEXT: v_lshl_add_u32 v3, s1, 5, s0
|
||||
; DAGISEL-NEXT: s_wait_storecnt 0x0
|
||||
; DAGISEL-NEXT: scratch_store_b32 off, v0, s1 scope:SCOPE_SYS
|
||||
; DAGISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
|
||||
; DAGISEL-NEXT: s_wait_storecnt 0x0
|
||||
; DAGISEL-NEXT: v_mov_b32_e32 v0, s33
|
||||
; DAGISEL-NEXT: v_readfirstlane_b32 s32, v1
|
||||
; DAGISEL-NEXT: v_readfirstlane_b32 s32, v3
|
||||
; DAGISEL-NEXT: s_mov_b32 s32, s33
|
||||
; DAGISEL-NEXT: s_mov_b32 s33, s34
|
||||
; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
|
||||
; DAGISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload
|
||||
; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:4
|
||||
; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:8
|
||||
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
|
||||
; DAGISEL-NEXT: s_mov_b32 s33, s2
|
||||
; DAGISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; DAGISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -355,34 +371,51 @@ define amdgpu_gfx ptr addrspace(5) @sponentry_gfx_dyn_alloc(i32 %val) #0 {
|
||||
; GISEL-NEXT: s_wait_samplecnt 0x0
|
||||
; GISEL-NEXT: s_wait_bvhcnt 0x0
|
||||
; GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GISEL-NEXT: v_lshl_add_u32 v1, v0, 2, 15
|
||||
; GISEL-NEXT: s_mov_b32 s34, s33
|
||||
; GISEL-NEXT: s_mov_b32 s1, exec_lo
|
||||
; GISEL-NEXT: s_mov_b32 s0, 0
|
||||
; GISEL-NEXT: s_mov_b32 s3, s33
|
||||
; GISEL-NEXT: s_mov_b32 s33, s32
|
||||
; GISEL-NEXT: v_and_b32_e32 v1, -16, v1
|
||||
; GISEL-NEXT: s_xor_saveexec_b32 s0, -1
|
||||
; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Spill
|
||||
; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:4
|
||||
; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:8
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: s_mov_b32 exec_lo, s0
|
||||
; GISEL-NEXT: v_lshl_add_u32 v3, v0, 2, 15
|
||||
; GISEL-NEXT: s_add_co_i32 s32, s32, 16
|
||||
; GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GISEL-NEXT: s_mov_b32 s0, s32
|
||||
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; GISEL-NEXT: v_and_b32_e32 v3, -16, v3
|
||||
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: v_readlane_b32 s3, v1, s2
|
||||
; GISEL-NEXT: s_bitset0_b32 s1, s2
|
||||
; GISEL-NEXT: s_max_u32 s0, s0, s3
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GISEL-NEXT: s_cbranch_scc1 .LBB9_1
|
||||
; GISEL-NEXT: ; %bb.2:
|
||||
; GISEL-NEXT: s_mov_b32 s1, s32
|
||||
; GISEL-NEXT: s_lshl_b32 s0, s0, 5
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, v3, s1
|
||||
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
|
||||
; GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
|
||||
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
|
||||
; GISEL-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
|
||||
; GISEL-NEXT: ds_swizzle_b32 v2, v1 offset:swizzle(BROADCAST,32,15)
|
||||
; GISEL-NEXT: s_wait_dscnt 0x0
|
||||
; GISEL-NEXT: v_max_u32_e32 v1, v1, v2
|
||||
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GISEL-NEXT: v_readlane_b32 s2, v1, 31
|
||||
; GISEL-NEXT: s_mov_b32 exec_lo, s1
|
||||
; GISEL-NEXT: s_lshl_b32 s1, s2, 5
|
||||
; GISEL-NEXT: s_wait_storecnt 0x0
|
||||
; GISEL-NEXT: scratch_store_b32 off, v0, s1 scope:SCOPE_SYS
|
||||
; GISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
|
||||
; GISEL-NEXT: s_wait_storecnt 0x0
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: s_add_co_u32 s32, s0, s1
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, s33
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: s_add_co_u32 s32, s1, s0
|
||||
; GISEL-NEXT: s_mov_b32 s32, s33
|
||||
; GISEL-NEXT: s_mov_b32 s33, s34
|
||||
; GISEL-NEXT: s_xor_saveexec_b32 s0, -1
|
||||
; GISEL-NEXT: s_clause 0x1 ; 8-byte Folded Reload
|
||||
; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:4
|
||||
; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:8
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: s_mov_b32 exec_lo, s0
|
||||
; GISEL-NEXT: s_mov_b32 s33, s3
|
||||
; GISEL-NEXT: s_wait_loadcnt 0x0
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: s_setpc_b64 s[30:31]
|
||||
%local = alloca i32, i32 %val, addrspace(5)
|
||||
@ -499,34 +532,36 @@ define amdgpu_cs_chain void @sponentry_cs_chain_dyn_alloc(i32 %val) #0 {
|
||||
; DAGISEL-NEXT: s_wait_samplecnt 0x0
|
||||
; DAGISEL-NEXT: s_wait_bvhcnt 0x0
|
||||
; DAGISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; DAGISEL-NEXT: v_lshl_add_u32 v0, v8, 2, 15
|
||||
; DAGISEL-NEXT: s_mov_b32 s1, exec_lo
|
||||
; DAGISEL-NEXT: s_mov_b32 s0, 0
|
||||
; DAGISEL-NEXT: v_lshl_add_u32 v2, v8, 2, 15
|
||||
; DAGISEL-NEXT: s_mov_b32 s33, s32
|
||||
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
|
||||
; DAGISEL-NEXT: v_and_b32_e32 v0, -16, v0
|
||||
; DAGISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
|
||||
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; DAGISEL-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
|
||||
; DAGISEL-NEXT: v_and_b32_e32 v2, -16, v2
|
||||
; DAGISEL-NEXT: s_or_saveexec_b32 s0, -1
|
||||
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; DAGISEL-NEXT: v_cndmask_b32_e64 v0, 0, v2, s0
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; DAGISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
|
||||
; DAGISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; DAGISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
|
||||
; DAGISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
|
||||
; DAGISEL-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
|
||||
; DAGISEL-NEXT: s_wait_dscnt 0x0
|
||||
; DAGISEL-NEXT: v_max_u32_e32 v0, v0, v1
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; DAGISEL-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; DAGISEL-NEXT: s_bitset0_b32 s1, s2
|
||||
; DAGISEL-NEXT: s_max_u32 s0, s0, s3
|
||||
; DAGISEL-NEXT: v_readlane_b32 s1, v0, 31
|
||||
; DAGISEL-NEXT: s_mov_b32 exec_lo, s0
|
||||
; DAGISEL-NEXT: s_mov_b32 s0, s32
|
||||
; DAGISEL-NEXT: v_mov_b32_e32 v3, s33
|
||||
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; DAGISEL-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; DAGISEL-NEXT: ; %bb.2:
|
||||
; DAGISEL-NEXT: s_mov_b32 s1, s32
|
||||
; DAGISEL-NEXT: v_mov_b32_e32 v1, s33
|
||||
; DAGISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; DAGISEL-NEXT: v_lshl_add_u32 v0, s0, 5, s1
|
||||
; DAGISEL-NEXT: v_lshl_add_u32 v2, s1, 5, s0
|
||||
; DAGISEL-NEXT: s_wait_storecnt 0x0
|
||||
; DAGISEL-NEXT: scratch_store_b32 off, v8, s1 scope:SCOPE_SYS
|
||||
; DAGISEL-NEXT: scratch_store_b32 off, v8, s0 scope:SCOPE_SYS
|
||||
; DAGISEL-NEXT: s_wait_storecnt 0x0
|
||||
; DAGISEL-NEXT: scratch_store_b32 off, v1, s1 scope:SCOPE_SYS
|
||||
; DAGISEL-NEXT: scratch_store_b32 off, v3, s0 scope:SCOPE_SYS
|
||||
; DAGISEL-NEXT: s_wait_storecnt 0x0
|
||||
; DAGISEL-NEXT: v_readfirstlane_b32 s32, v0
|
||||
; DAGISEL-NEXT: v_readfirstlane_b32 s32, v2
|
||||
; DAGISEL-NEXT: s_alloc_vgpr 0
|
||||
; DAGISEL-NEXT: s_endpgm
|
||||
;
|
||||
@ -537,34 +572,36 @@ define amdgpu_cs_chain void @sponentry_cs_chain_dyn_alloc(i32 %val) #0 {
|
||||
; GISEL-NEXT: s_wait_samplecnt 0x0
|
||||
; GISEL-NEXT: s_wait_bvhcnt 0x0
|
||||
; GISEL-NEXT: s_wait_kmcnt 0x0
|
||||
; GISEL-NEXT: v_lshl_add_u32 v0, v8, 2, 15
|
||||
; GISEL-NEXT: s_mov_b32 s1, exec_lo
|
||||
; GISEL-NEXT: s_mov_b32 s0, 0
|
||||
; GISEL-NEXT: v_lshl_add_u32 v2, v8, 2, 15
|
||||
; GISEL-NEXT: s_mov_b32 s33, s32
|
||||
; GISEL-NEXT: s_add_co_i32 s32, s32, 16
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, -16, v0
|
||||
; GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: s_ctz_i32_b32 s2, s1
|
||||
; GISEL-NEXT: s_mov_b32 s0, s32
|
||||
; GISEL-NEXT: v_and_b32_e32 v2, -16, v2
|
||||
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GISEL-NEXT: v_readlane_b32 s3, v0, s2
|
||||
; GISEL-NEXT: s_bitset0_b32 s1, s2
|
||||
; GISEL-NEXT: s_max_u32 s0, s0, s3
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GISEL-NEXT: s_cbranch_scc1 .LBB12_1
|
||||
; GISEL-NEXT: ; %bb.2:
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, s33
|
||||
; GISEL-NEXT: s_mov_b32 s1, s32
|
||||
; GISEL-NEXT: s_lshl_b32 s0, s0, 5
|
||||
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, v2, s1
|
||||
; GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0xf bank_mask:0xf
|
||||
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:2 row_mask:0xf bank_mask:0xf
|
||||
; GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:4 row_mask:0xf bank_mask:0xf
|
||||
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
||||
; GISEL-NEXT: v_max_u32_dpp v0, v0, v0 row_shr:8 row_mask:0xf bank_mask:0xf
|
||||
; GISEL-NEXT: ds_swizzle_b32 v1, v0 offset:swizzle(BROADCAST,32,15)
|
||||
; GISEL-NEXT: s_wait_dscnt 0x0
|
||||
; GISEL-NEXT: v_max_u32_e32 v0, v0, v1
|
||||
; GISEL-NEXT: v_readlane_b32 s2, v0, 31
|
||||
; GISEL-NEXT: s_mov_b32 exec_lo, s1
|
||||
; GISEL-NEXT: v_mov_b32_e32 v2, s33
|
||||
; GISEL-NEXT: s_lshl_b32 s1, s2, 5
|
||||
; GISEL-NEXT: s_wait_storecnt 0x0
|
||||
; GISEL-NEXT: scratch_store_b32 off, v8, s1 scope:SCOPE_SYS
|
||||
; GISEL-NEXT: s_wait_storecnt 0x0
|
||||
; GISEL-NEXT: scratch_store_b32 off, v0, s1 scope:SCOPE_SYS
|
||||
; GISEL-NEXT: scratch_store_b32 off, v8, s0 scope:SCOPE_SYS
|
||||
; GISEL-NEXT: s_wait_storecnt 0x0
|
||||
; GISEL-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GISEL-NEXT: s_add_co_u32 s32, s1, s0
|
||||
; GISEL-NEXT: s_add_co_u32 s32, s0, s1
|
||||
; GISEL-NEXT: scratch_store_b32 off, v2, s0 scope:SCOPE_SYS
|
||||
; GISEL-NEXT: s_wait_storecnt 0x0
|
||||
; GISEL-NEXT: s_alloc_vgpr 0
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%local = alloca i32, i32 %val, addrspace(5)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user