Add a helper function for expanding 64-bit scalar arithmetic pseudo-ops.
commit e5007647e5
parent 163ae0d91e
@@ -5192,6 +5192,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
+static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
+                                                      MachineBasicBlock *BB) {
+  // For targets older than GFX12, we emit a sequence of 32-bit operations.
+  // For GFX12, we emit s_add_u64 and s_sub_u64.
+  MachineFunction *MF = BB->getParent();
+  const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineOperand &Dest = MI.getOperand(0);
+  MachineOperand &Src0 = MI.getOperand(1);
+  MachineOperand &Src1 = MI.getOperand(2);
+  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+  if (ST.hasScalarAddSub64()) {
+    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
+    // clang-format off
+    BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
+        .add(Src0)
+        .add(Src1);
+    // clang-format on
+  } else {
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+
+    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
+    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+        .addReg(DestSub0)
+        .addImm(AMDGPU::sub0)
+        .addReg(DestSub1)
+        .addImm(AMDGPU::sub1);
+  }
+  MI.eraseFromParent();
+  return BB;
+}
+
 static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   switch (Opc) {
   case AMDGPU::S_MIN_U32:
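For readers unfamiliar with the pre-GFX12 path: it is the standard carry-chain decomposition of a 64-bit add or subtract into two 32-bit scalar ops linked through SCC. A minimal standalone model of the addition arithmetic (plain C++; the function name and scaffolding are illustrative, not part of this patch):

```cpp
#include <cstdint>

// Models the else-branch above for S_ADD_U64_PSEUDO: s_add_u32 defines SCC
// (carry-out of the low half), s_addc_u32 consumes it, and REG_SEQUENCE
// reassembles sub0/sub1 into the 64-bit result.
static uint64_t expandAdd64(uint64_t Src0, uint64_t Src1) {
  uint32_t Lo = (uint32_t)Src0 + (uint32_t)Src1;   // s_add_u32
  bool SCC = Lo < (uint32_t)Src0;                  // carry-out of low half
  uint32_t Hi =
      (uint32_t)(Src0 >> 32) + (uint32_t)(Src1 >> 32) + SCC; // s_addc_u32
  return ((uint64_t)Hi << 32) | Lo;                // REG_SEQUENCE sub1:sub0
}
```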
@@ -5552,43 +5604,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     }
     case AMDGPU::S_ADD_U64_PSEUDO:
     case AMDGPU::S_SUB_U64_PSEUDO: {
-      if (ST.hasScalarAddSub64()) {
-        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
-                                 TII->get(Opc == AMDGPU::S_ADD_U64_PSEUDO
-                                              ? AMDGPU::S_ADD_U64
-                                              : AMDGPU::S_SUB_U64),
-                                 DstReg)
-                             .addReg(Accumulator->getOperand(0).getReg())
-                             .addReg(LaneValue->getOperand(0).getReg());
-      } else {
-        unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO
-                               ? AMDGPU::S_ADD_U32
-                               : AMDGPU::S_SUB_U32;
-        unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO
-                               ? AMDGPU::S_ADDC_U32
-                               : AMDGPU::S_SUBB_U32;
-        Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-        Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-        MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
-            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
-            &AMDGPU::SReg_32RegClass);
-        MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
-            MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
-            &AMDGPU::SReg_32RegClass);
-        BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
-            .add(Accumlo)
-            .addReg(LaneValueLo->getOperand(0).getReg());
-        BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
-            .add(Accumhi)
-            .addReg(LaneValueHi->getOperand(0).getReg())
-            .setOperandDead(3); // Dead scc
-        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
-                                 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
-                             .addReg(DestLo)
-                             .addImm(AMDGPU::sub0)
-                             .addReg(DestHi)
-                             .addImm(AMDGPU::sub1);
-      }
+      NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                           .addReg(Accumulator->getOperand(0).getReg())
+                           .addReg(LaneValue->getOperand(0).getReg());
+      ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
       break;
     }
     }
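The rewritten case materializes the pseudo and immediately hands it to the shared helper instead of duplicating the expansion inline. As a sanity check on the semantics, what the reduction loop computes for S_ADD_U64_PSEUDO is a plain running sum over the active lanes; a hedged C++ model (the lane array and mask are stand-ins for v_readlane and exec, not LLVM API):

```cpp
#include <cstdint>

// One iteration per set bit of the exec mask, mirroring the ComputeLoop:
// find the next active lane, read its value, accumulate with the expanded
// 64-bit add, then retire the lane.
static uint64_t waveReduceAdd64(const uint64_t LaneValues[64], uint64_t Exec) {
  uint64_t Accumulator = 0; // identity value for ADD
  while (Exec) {
    unsigned Lane = (unsigned)__builtin_ctzll(Exec); // s_ff1_i32_b64
    Accumulator += LaneValues[Lane];                 // expanded 64-bit add
    Exec &= Exec - 1;                                // clear the processed lane
  }
  return Accumulator;
}
```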
@@ -5601,8 +5620,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
       .addReg(ActiveBitsReg);
 
   // Add phi nodes
-  Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
-      .addMBB(ComputeLoop);
+  Accumulator.addReg(DstReg).addMBB(ComputeLoop);
   ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
 
   // Creating branching
@@ -5684,55 +5702,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   }
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO: {
-    // For targets older than GFX12, we emit a sequence of 32-bit operations.
-    // For GFX12, we emit s_add_u64 and s_sub_u64.
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const DebugLoc &DL = MI.getDebugLoc();
-    MachineOperand &Dest = MI.getOperand(0);
-    MachineOperand &Src0 = MI.getOperand(1);
-    MachineOperand &Src1 = MI.getOperand(2);
-    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
-    if (Subtarget->hasScalarAddSub64()) {
-      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
-      // clang-format off
-      BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
-          .add(Src0)
-          .add(Src1);
-      // clang-format on
-    } else {
-      const SIRegisterInfo *TRI = ST.getRegisterInfo();
-      const TargetRegisterClass *BoolRC = TRI->getBoolRC();
-
-      Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-      MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
-      MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
-      MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
-      MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
-      unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
-      unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
-      BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
-          .add(Src0Sub0)
-          .add(Src1Sub0);
-      BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
-          .add(Src0Sub1)
-          .add(Src1Sub1);
-      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
-          .addReg(DestSub0)
-          .addImm(AMDGPU::sub0)
-          .addReg(DestSub1)
-          .addImm(AMDGPU::sub1);
-    }
-    MI.eraseFromParent();
-    return BB;
+    return Expand64BitScalarArithmetic(MI, BB);
   }
   case AMDGPU::V_ADD_U64_PSEUDO:
   case AMDGPU::V_SUB_U64_PSEUDO: {
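With both call sites now funneling through Expand64BitScalarArithmetic, the subtraction path is the mirror image of the addition sketch above: a borrow chain through SCC. A minimal model (plain C++; illustrative names, not part of the patch):

```cpp
#include <cstdint>

// Models the pre-GFX12 lowering of S_SUB_U64_PSEUDO: s_sub_u32 sets SCC on
// borrow-out of the low half, and s_subb_u32 subtracts it from the high half.
static uint64_t expandSub64(uint64_t Src0, uint64_t Src1) {
  uint32_t Lo = (uint32_t)Src0 - (uint32_t)Src1;   // s_sub_u32
  bool SCC = (uint32_t)Src0 < (uint32_t)Src1;      // borrow-out of low half
  uint32_t Hi =
      (uint32_t)(Src0 >> 32) - (uint32_t)(Src1 >> 32) - SCC; // s_subb_u32
  return ((uint64_t)Hi << 32) | Lo;                // REG_SEQUENCE sub1:sub0
}
```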