Adding a helper function for expanding 64-bit scalar arithmetic ops.

Aaditya 2025-08-21 14:58:43 +05:30
parent 163ae0d91e
commit e5007647e5
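The new Expand64BitScalarArithmetic helper centralizes the lowering of S_ADD_U64_PSEUDO and S_SUB_U64_PSEUDO. On subtargets with hasScalarAddSub64() (GFX12) it emits the native s_add_u64/s_sub_u64; on older targets it splits the 64-bit operation into a low 32-bit op that produces a carry or borrow in SCC and a high 32-bit op that consumes it, then recombines the halves with a REG_SEQUENCE. As a minimal standalone sketch of that split in plain C++ (illustrative only, not backend code; add64ViaHalves is a made-up name):

    #include <cstdint>

    // Pre-GFX12 add lowering in miniature: the low 32-bit add produces a
    // carry-out (SCC after S_ADD_U32) that the high add consumes
    // (S_ADDC_U32); the halves are then recombined, which the backend does
    // with a REG_SEQUENCE.
    static uint64_t add64ViaHalves(uint64_t A, uint64_t B) {
      uint32_t Lo = uint32_t(A) + uint32_t(B);                     // S_ADD_U32
      uint32_t Carry = Lo < uint32_t(A);                           // SCC
      uint32_t Hi = uint32_t(A >> 32) + uint32_t(B >> 32) + Carry; // S_ADDC_U32
      return (uint64_t(Hi) << 32) | Lo;                            // REG_SEQUENCE
    }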


@@ -5192,6 +5192,58 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
+static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
+                                                      MachineBasicBlock *BB) {
+  // For targets older than GFX12, we emit a sequence of 32-bit operations.
+  // For GFX12, we emit s_add_u64 and s_sub_u64.
+  MachineFunction *MF = BB->getParent();
+  const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineOperand &Dest = MI.getOperand(0);
+  MachineOperand &Src0 = MI.getOperand(1);
+  MachineOperand &Src1 = MI.getOperand(2);
+
+  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
+  if (ST.hasScalarAddSub64()) {
+    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
+    // clang-format off
+    BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
+        .add(Src0)
+        .add(Src1);
+    // clang-format on
+  } else {
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+
+    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
+    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
+    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+        .addReg(DestSub0)
+        .addImm(AMDGPU::sub0)
+        .addReg(DestSub1)
+        .addImm(AMDGPU::sub1);
+  }
+  MI.eraseFromParent();
+  return BB;
+}
+
 static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
   switch (Opc) {
   case AMDGPU::S_MIN_U32:
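The subtract pseudo follows the same structure with a borrow chain instead of a carry chain (S_SUB_U32 followed by S_SUBB_U32). A matching sketch, again plain illustrative C++ rather than backend code (sub64ViaHalves is a made-up name):

    #include <cstdint>

    // Borrow-chain counterpart of the add sketch above: S_SUB_U32 sets SCC
    // on a borrow-out of the low half, and S_SUBB_U32 subtracts it from the
    // high half.
    static uint64_t sub64ViaHalves(uint64_t A, uint64_t B) {
      uint32_t Lo = uint32_t(A) - uint32_t(B);                      // S_SUB_U32
      uint32_t Borrow = uint32_t(A) < uint32_t(B);                  // SCC
      uint32_t Hi = uint32_t(A >> 32) - uint32_t(B >> 32) - Borrow; // S_SUBB_U32
      return (uint64_t(Hi) << 32) | Lo;
    }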
@@ -5552,43 +5604,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   }
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO: {
-    if (ST.hasScalarAddSub64()) {
-      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
-                               TII->get(Opc == AMDGPU::S_ADD_U64_PSEUDO
-                                            ? AMDGPU::S_ADD_U64
-                                            : AMDGPU::S_SUB_U64),
-                               DstReg)
-                           .addReg(Accumulator->getOperand(0).getReg())
-                           .addReg(LaneValue->getOperand(0).getReg());
-    } else {
-      unsigned NewOpc1 = Opc == AMDGPU::S_ADD_U64_PSEUDO
-                             ? AMDGPU::S_ADD_U32
-                             : AMDGPU::S_SUB_U32;
-      unsigned NewOpc2 = Opc == AMDGPU::S_ADD_U64_PSEUDO
-                             ? AMDGPU::S_ADDC_U32
-                             : AMDGPU::S_SUBB_U32;
-      Register DestLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      Register DestHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      MachineOperand Accumlo = TII->buildExtractSubRegOrImm(
-          MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub0,
-          &AMDGPU::SReg_32RegClass);
-      MachineOperand Accumhi = TII->buildExtractSubRegOrImm(
-          MI, MRI, Accumulator->getOperand(0), DstRegClass, AMDGPU::sub1,
-          &AMDGPU::SReg_32RegClass);
-      BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc1), DestLo)
-          .add(Accumlo)
-          .addReg(LaneValueLo->getOperand(0).getReg());
-      BuildMI(*ComputeLoop, I, DL, TII->get(NewOpc2), DestHi)
-          .add(Accumhi)
-          .addReg(LaneValueHi->getOperand(0).getReg())
-          .setOperandDead(3); // Dead scc
-      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
-                               TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
-                           .addReg(DestLo)
-                           .addImm(AMDGPU::sub0)
-                           .addReg(DestHi)
-                           .addImm(AMDGPU::sub1);
-    }
+    NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+                         .addReg(Accumulator->getOperand(0).getReg())
+                         .addReg(LaneValue->getOperand(0).getReg());
+    ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
     break;
   }
   }
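The wave-reduce path now materializes the plain pseudo and immediately expands it in place; the helper erases the pseudo and leaves DstReg defined either by the native 64-bit op or by the final REG_SEQUENCE, which is why the phi in the next hunk can read DstReg directly. A quick standalone check that the two sketches above agree with native 64-bit arithmetic across the carry/borrow boundary (illustrative test, not part of the commit):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Values chosen to exercise carry/borrow propagation across bit 32.
      const uint64_t Cases[] = {0, 1, 0xFFFFFFFFull, 0x100000000ull,
                                0x0123456789ABCDEFull, 0xFFFFFFFFFFFFFFFFull};
      for (uint64_t A : Cases)
        for (uint64_t B : Cases) {
          assert(add64ViaHalves(A, B) == A + B); // unsigned wrap, well-defined
          assert(sub64ViaHalves(A, B) == A - B);
        }
      return 0;
    }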
@@ -5601,8 +5620,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
           .addReg(ActiveBitsReg);
 
   // Add phi nodes
-  Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
-      .addMBB(ComputeLoop);
+  Accumulator.addReg(DstReg).addMBB(ComputeLoop);
   ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
 
   // Creating branching
@@ -5684,55 +5702,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   }
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO: {
-    // For targets older than GFX12, we emit a sequence of 32-bit operations.
-    // For GFX12, we emit s_add_u64 and s_sub_u64.
-    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const DebugLoc &DL = MI.getDebugLoc();
-    MachineOperand &Dest = MI.getOperand(0);
-    MachineOperand &Src0 = MI.getOperand(1);
-    MachineOperand &Src1 = MI.getOperand(2);
-
-    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
-    if (Subtarget->hasScalarAddSub64()) {
-      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
-      // clang-format off
-      BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
-          .add(Src0)
-          .add(Src1);
-      // clang-format on
-    } else {
-      const SIRegisterInfo *TRI = ST.getRegisterInfo();
-      const TargetRegisterClass *BoolRC = TRI->getBoolRC();
-
-      Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
-      MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
-      MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-      MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
-      MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
-          MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
-      unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
-      unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
-
-      BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
-          .add(Src0Sub0)
-          .add(Src1Sub0);
-      BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
-          .add(Src0Sub1)
-          .add(Src1Sub1);
-      BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
-          .addReg(DestSub0)
-          .addImm(AMDGPU::sub0)
-          .addReg(DestSub1)
-          .addImm(AMDGPU::sub1);
-    }
-    MI.eraseFromParent();
-    return BB;
+    return Expand64BitScalarArithmetic(MI, BB);
   }
   case AMDGPU::V_ADD_U64_PSEUDO:
   case AMDGPU::V_SUB_U64_PSEUDO: {