//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.cpp ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
using namespace MIPatternMatch;
AMDGPUCombinerHelper::AMDGPUCombinerHelper(
GISelChangeObserver &Observer, MachineIRBuilder &B, bool IsPreLegalize,
GISelValueTracking *VT, MachineDominatorTree *MDT, const LegalizerInfo *LI,
const GCNSubtarget &STI)
: CombinerHelper(Observer, B, IsPreLegalize, VT, MDT, LI), STI(STI),
TII(*STI.getInstrInfo()) {}
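
/// Returns true if an fneg of \p MI's result can be folded into the
/// instruction itself, either by negating its operands or by switching to the
/// opposite opcode (e.g. fmin <-> fmax).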
LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
  return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}
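
/// Returns true if every non-debug user of \p MI's result accepts source
/// modifiers, and folding a modifier into those users would not force more
/// than \p CostThreshold of them into a larger VOP3 encoding.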
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and using a source modifier would force each of them into
  // a VOP3 encoding, there will be a code size increase. Try to avoid
  // increasing code size unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;
    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}
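
/// Returns true if the sign of zero can be ignored for this instruction,
/// either because no-signed-zeros FP math is enabled globally or because the
/// instruction carries the nsz flag.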
static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));
  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
// additional cost to negate them.
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;
    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}
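
/// Maps a min/max opcode to its counterpart, so that fneg (min a, b) can be
/// rewritten as max (fneg a), (fneg b) and vice versa.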
static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_FMAXIMUM:
    return AMDGPU::G_FMINIMUM;
  case AMDGPU::G_FMINIMUM:
    return AMDGPU::G_FMAXIMUM;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}
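
/// Match a G_FNEG whose source instruction can absorb the negation, either by
/// negating its own operands or by switching to the inverse opcode.
/// \p MatchInfo is set to the instruction defining the negated value.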
bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) const {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) const {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B

  // Replace register in operand with a register holding negated value.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Replace either register in operands with a register holding negated value.
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate appropriate operands so that resulting value of MatchInfo is
  // negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }

  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
    // but replaceRegWith will replace defs as well. It is easier to replace one
    // def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate non negated value for other uses of old MatchInfoDst
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}

// TODO: Should return converted value / extension source and avoid introducing
// intermediate fptruncs in the apply function.
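// Returns true if \p Reg is defined by a G_FPEXT from f16, or by an f32
// G_FCONSTANT that converts to f16 without losing information.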
static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
                                  Register Reg) {
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
    Register SrcReg = Def->getOperand(1).getReg();
    return MRI.getType(SrcReg) == LLT::scalar(16);
  }

  if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
    APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }

  return false;
}
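
/// \p MI truncates the result of an f32 med3 that was promoted from f16; match
/// when all three med3 sources (\p Src0, \p Src1, \p Src2) are fpexts from f16
/// or constants exactly representable in f16, so the median can be recomputed
/// directly at f16.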
bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) const {
  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
  Register SrcReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
    return false;

  return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
         isFPExtFromF16OrConst(MRI, Src2);
}

void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) const {
  // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
  // sources.
  Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
  Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
  Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);

  LLT Ty = MRI.getType(Src0);
  // Expand the median: med3(a, b, c) = min(max(a, b), max(min(a, b), c)).
  auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
  auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
  auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
  Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
  MI.eraseFromParent();
}
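
/// Fold (fmul x, (select cond, C1, C2)) into
/// (fldexp x, (select cond, log2|C1|, log2|C2|)) when both constants are
/// powers of two of the same sign; x is negated first if the constants are
/// negative.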
bool AMDGPUCombinerHelper::matchCombineFmulWithSelectToFldexp(
    MachineInstr &MI, MachineInstr &Sel,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  assert(MI.getOpcode() == TargetOpcode::G_FMUL);
  assert(Sel.getOpcode() == TargetOpcode::G_SELECT);
  assert(MI.getOperand(2).getReg() == Sel.getOperand(0).getReg());

  Register Dst = MI.getOperand(0).getReg();
  LLT DestTy = MRI.getType(Dst);
  LLT ScalarDestTy = DestTy.getScalarType();

  if ((ScalarDestTy != LLT::float64() && ScalarDestTy != LLT::float32() &&
       ScalarDestTy != LLT::float16()) ||
      !MRI.hasOneNonDBGUse(Sel.getOperand(0).getReg()))
    return false;

  Register SelectCondReg = Sel.getOperand(1).getReg();
  MachineInstr *SelectTrue = MRI.getVRegDef(Sel.getOperand(2).getReg());
  MachineInstr *SelectFalse = MRI.getVRegDef(Sel.getOperand(3).getReg());

  const auto SelectTrueVal =
      isConstantOrConstantSplatVectorFP(*SelectTrue, MRI);
  if (!SelectTrueVal)
    return false;
  const auto SelectFalseVal =
      isConstantOrConstantSplatVectorFP(*SelectFalse, MRI);
  if (!SelectFalseVal)
    return false;

  if (SelectTrueVal->isNegative() != SelectFalseVal->isNegative())
    return false;

  // For f32, only non-inline constants should be transformed.
  if (ScalarDestTy == LLT::float32() && TII.isInlineConstant(*SelectTrueVal) &&
      TII.isInlineConstant(*SelectFalseVal))
    return false;

  int SelectTrueLog2Val = SelectTrueVal->getExactLog2Abs();
  if (SelectTrueLog2Val == INT_MIN)
    return false;
  int SelectFalseLog2Val = SelectFalseVal->getExactLog2Abs();
  if (SelectFalseLog2Val == INT_MIN)
    return false;

  MatchInfo = [=, &MI](MachineIRBuilder &Builder) {
    LLT IntDestTy = DestTy.changeElementType(LLT::scalar(32));
    auto NewSel = Builder.buildSelect(
        IntDestTy, SelectCondReg,
        Builder.buildConstant(IntDestTy, SelectTrueLog2Val),
        Builder.buildConstant(IntDestTy, SelectFalseLog2Val));

    Register XReg = MI.getOperand(1).getReg();
    if (SelectTrueVal->isNegative()) {
      auto NegX =
          Builder.buildFNeg(DestTy, XReg, MRI.getVRegDef(XReg)->getFlags());
      Builder.buildFLdexp(Dst, NegX, NewSel, MI.getFlags());
    } else {
      Builder.buildFLdexp(Dst, XReg, NewSel, MI.getFlags());
    }
  };

  return true;
}
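
/// Returns true if \p Reg is a 64-bit constant whose low 32 bits or high 32
/// bits are all ones. Splitting an s64 and/or with such a mask into two s32
/// instructions lets one of the halves become a simple copy or be removed.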
bool AMDGPUCombinerHelper::matchConstantIs32BitMask(Register Reg) const {
  auto Res = getIConstantVRegValWithLookThrough(Reg, MRI);
  if (!Res)
    return false;

  const uint64_t Val = Res->Value.getZExtValue();
  unsigned MaskIdx = 0;
  unsigned MaskLen = 0;
  if (!isShiftedMask_64(Val, MaskIdx, MaskLen))
    return false;

  // Check if low 32 bits or high 32 bits are all ones.
  return MaskLen >= 32 && ((MaskIdx == 0) || (MaskIdx == 64 - MaskLen));
}