Reverts llvm/llvm-project#179652. This PR causes out-of-memory build failures on many Windows builders.
805 lines
29 KiB
C++
805 lines
29 KiB
C++
//===-- SIPreEmitPeephole.cpp ------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file
|
|
/// This pass performs the peephole optimizations before code emission.
|
|
///
|
|
/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32/F16,
|
|
/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs such that they can be
|
|
/// co-issued. This helps with overlapping MFMA and certain vector instructions
|
|
/// in machine schedules and is expected to improve performance. Only those
|
|
/// packed instructions are unpacked that are overlapped by the MFMA latency.
|
|
/// Rest should remain untouched.
|
|
/// TODO: Add support for F16 packed instructions
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
|
#include "llvm/ADT/SetVector.h"
|
|
#include "llvm/CodeGen/MachineDominators.h"
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
#include "llvm/CodeGen/MachinePostDominators.h"
|
|
#include "llvm/CodeGen/TargetSchedule.h"
|
|
#include "llvm/Support/BranchProbability.h"
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "si-pre-emit-peephole"
|
|
|
|
namespace {
|
|
|
|
class SIPreEmitPeephole {
|
|
private:
|
|
const SIInstrInfo *TII = nullptr;
|
|
const SIRegisterInfo *TRI = nullptr;
|
|
|
|
bool optimizeVccBranch(MachineInstr &MI) const;
|
|
bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
|
|
bool getBlockDestinations(MachineBasicBlock &SrcMBB,
|
|
MachineBasicBlock *&TrueMBB,
|
|
MachineBasicBlock *&FalseMBB,
|
|
SmallVectorImpl<MachineOperand> &Cond);
|
|
bool mustRetainExeczBranch(const MachineInstr &Branch,
|
|
const MachineBasicBlock &From,
|
|
const MachineBasicBlock &To) const;
|
|
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
|
|
// Creates a list of packed instructions following an MFMA that are suitable
|
|
// for unpacking.
|
|
void collectUnpackingCandidates(MachineInstr &BeginMI,
|
|
SetVector<MachineInstr *> &InstrsToUnpack,
|
|
uint16_t NumMFMACycles);
|
|
// v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
|
|
// op_sel_hi:[0,0,0]
|
|
// ==>
|
|
// v_fma_f32 v0, v1, v3, v3
|
|
// v_fma_f32 v1, v0, v2, v2
|
|
// Here, we have overwritten v0 before we use it. This function checks if
|
|
// unpacking can lead to such a situation.
|
|
bool canUnpackingClobberRegister(const MachineInstr &MI);
|
|
// Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
|
|
// V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
|
|
// this transformation.
|
|
void performF32Unpacking(MachineInstr &I);
|
|
// Select corresponding unpacked instruction
|
|
uint16_t mapToUnpackedOpcode(MachineInstr &I);
|
|
// Creates the unpacked instruction to be inserted. Adds source modifiers to
|
|
// the unpacked instructions based on the source modifiers in the packed
|
|
// instruction.
|
|
MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode,
|
|
bool IsHiBits);
|
|
// Process operands/source modifiers from packed instructions and insert the
|
|
// appropriate source modifers and operands into the unpacked instructions.
|
|
void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods,
|
|
bool IsHiBits, const MachineOperand &SrcMO);
|
|
|
|
public:
|
|
bool run(MachineFunction &MF);
|
|
};
|
|
|
|
class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
|
|
public:
|
|
static char ID;
|
|
|
|
SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {}
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override {
|
|
return SIPreEmitPeephole().run(MF);
|
|
}
|
|
};
|
|
|
|
} // End anonymous namespace.
|
|
|
|
INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
|
|
"SI peephole optimizations", false, false)
|
|
|
|
char SIPreEmitPeepholeLegacy::ID = 0;
|
|
|
|
char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;
|
|
|
|
/// Simplifies the VCC branch \p MI. Returns true if any instruction was
/// changed, inserted or erased.
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  //   sreg = -1 or 0
  //   vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  //   S_CBRANCH_VCC[N]Z
  // =>
  //   S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  //
  // While searching this also performs the following substitution:
  //   vcc = V_CMP
  //   vcc = S_AND exec, vcc
  //   S_CBRANCH_VCC[N]Z
  // =>
  //   vcc = V_CMP
  //   S_CBRANCH_VCC[N]Z

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const unsigned CondReg = TRI->getVCC();

  // Pick the wave64 forms by default and switch to the 32-bit ones for wave32.
  unsigned ExecReg = AMDGPU::EXEC;
  unsigned AndOpc = AMDGPU::S_AND_B64;
  unsigned AndN2Opc = AMDGPU::S_ANDN2_B64;
  unsigned MovOpc = AMDGPU::S_MOV_B64;
  if (ST.isWave32()) {
    ExecReg = AMDGPU::EXEC_LO;
    AndOpc = AMDGPU::S_AND_B32;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    MovOpc = AMDGPU::S_MOV_B32;
  }

  const auto IsExecOperand = [ExecReg](const MachineOperand &MO) {
    return MO.isReg() && MO.getReg() == ExecReg;
  };
  const auto IsAllOnesOrZero = [](int64_t Imm) { return Imm == -1 || Imm == 0; };

  // Walk backwards from the branch looking for the S_AND/S_ANDN2 that defines
  // the condition register, giving up after a few instructions.
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator();
  const MachineBasicBlock::reverse_iterator E = MBB.rend();
  bool ReadsCond = false;
  unsigned Budget = 5;
  for (++A; A != E; ++A) {
    if (--Budget == 0)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      const bool IsMaskingOp =
          A->getOpcode() == AndOpc || A->getOpcode() == AndN2Opc;
      if (!(A->definesRegister(CondReg, TRI) && IsMaskingOp))
        return false;
      break;
    }
    if (A->readsRegister(CondReg, TRI))
      ReadsCond = true;
  }
  if (A == E)
    return false;

  // Canonicalize so that EXEC is the first source operand.
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (!IsExecOperand(Op1) && IsExecOperand(Op2)) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (!IsExecOperand(Op1))
    return Changed;
  if (Op2.isImm() && !IsAllOnesOrZero(Op2.getImm()))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else if (Op2.isReg()) {
    SReg = Op2.getReg();

    // Continue backwards to the instruction that defines the mask register.
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      if (M->readsRegister(SReg, TRI))
        ReadsSreg = true;
      if (M->modifiesRegister(ExecReg, TRI))
        ModifiesExec = true;
    }
    if (M == E)
      return Changed;

    // If SReg is VCC and SReg definition is a VALU comparison.
    // This means S_AND with EXEC is not required.
    // Erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS
    if (A->getOpcode() == AndOpc && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }

    // Otherwise the definition has to be a move of -1 or 0.
    if (!M->isMoveImmediate())
      return Changed;
    const MachineOperand &MovSrc = M->getOperand(1);
    if (!MovSrc.isImm() || !IsAllOnesOrZero(MovSrc.getImm()))
      return Changed;
    MaskValue = MovSrc.getImm();

    // First if sreg is only used in the AND instruction fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2Opc)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV: the condition register is still read after the
      // branch, so it must keep receiving the value the AND produced.
      MachineInstrBuilder MovMI = BuildMI(*A->getParent(), *A, A->getDebugLoc(),
                                          TII->get(MovOpc), CondReg);
      if (MaskValue == 0)
        MovMI.addImm(0);
      else
        MovMI.addReg(ExecReg);
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

  const bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (MaskValue == 0) {
    if (!IsVCCZ) {
      // Will never branch
      MachineOperand &Dst = MI.getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      MI.getParent()->removeSuccessor(Dst.getMBB());
      MI.eraseFromParent();
      return true;
    }

    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (!Found) {
        Found = Term.isIdenticalTo(MI);
        continue;
      }
      if (Term.isBranch())
        ToRemove.push_back(&Term);
    }
    assert(Found && "conditional branch is not terminator");
    for (MachineInstr *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough())
      Parent->removeSuccessor(Succ);

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  // The rewritten branch no longer reads the condition register; drop that
  // use and add the implicit operands of the new opcode.
  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
|
|
|
|
/// Erases \p MI when it repeats the earlier S_SET_GPR_IDX_ON \p First and the
/// instructions in between allow the first one to stay in effect. Any
/// S_SET_GPR_IDX_OFF found between the two is erased as well. Returns true if
/// \p MI was erased.
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  // Only an exact repeat of the first instruction can be dropped.
  if (!MI.isIdenticalTo(First))
    return false;

  MachineBasicBlock &MBB = *MI.getParent();
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const MachineOperand *IdxOp = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  const Register IdxReg = IdxOp->isReg() ? IdxOp->getReg() : Register();

  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  // Walk forward over the instructions strictly between First and MI.
  MachineBasicBlock::instr_iterator I = std::next(First.getIterator());
  const MachineBasicBlock::instr_iterator E = MI.getIterator();
  for (; I != E; ++I) {
    if (I->isBundle() || I->isDebugInstr())
      continue;

    const unsigned Opc = I->getOpcode();
    if (Opc == AMDGPU::S_SET_GPR_IDX_MODE)
      return false;
    if (Opc == AMDGPU::S_SET_GPR_IDX_OFF) {
      IdxOn = false;
      ToRemove.push_back(&*I);
      continue;
    }

    if (I->modifiesRegister(AMDGPU::M0, TRI))
      return false;
    if (IdxReg && I->modifiesRegister(IdxReg, TRI))
      return false;

    const bool UsesVectorReg =
        llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
          return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
        });
    if (!UsesVectorReg)
      continue;

    // The only exception allowed here is another indirect vector move
    // with the same mode.
    const bool IsIndirectMove = Opc == AMDGPU::V_MOV_B32_indirect_write ||
                                Opc == AMDGPU::V_MOV_B32_indirect_read;
    if (!IdxOn || !IsIndirectMove)
      return false;
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}
|
|
|
|
/// Analyzes the terminators of \p SrcMBB into \p TrueMBB, \p FalseMBB and
/// \p Cond. When the analysis reports no false destination, \p FalseMBB is
/// set to the block that follows \p SrcMBB in layout. Returns false if the
/// branch could not be analyzed.
bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  // analyzeBranch() returns true on failure.
  const bool AnalysisFailed =
      TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond);
  if (AnalysisFailed)
    return false;

  if (FalseMBB == nullptr)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}
|
|
|
|
namespace {
|
|
class BranchWeightCostModel {
|
|
const SIInstrInfo &TII;
|
|
const TargetSchedModel &SchedModel;
|
|
BranchProbability BranchProb;
|
|
static constexpr uint64_t BranchNotTakenCost = 1;
|
|
uint64_t BranchTakenCost;
|
|
uint64_t ThenCyclesCost = 0;
|
|
|
|
public:
|
|
BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
|
|
const MachineBasicBlock &Succ)
|
|
: TII(TII), SchedModel(TII.getSchedModel()) {
|
|
const MachineBasicBlock &Head = *Branch.getParent();
|
|
const auto *FromIt = find(Head.successors(), &Succ);
|
|
assert(FromIt != Head.succ_end());
|
|
|
|
BranchProb = Head.getSuccProbability(FromIt);
|
|
if (BranchProb.isUnknown())
|
|
BranchProb = BranchProbability::getZero();
|
|
BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
|
|
}
|
|
|
|
bool isProfitable(const MachineInstr &MI) {
|
|
if (TII.isWaitcnt(MI.getOpcode()))
|
|
return false;
|
|
|
|
ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
|
|
|
|
// Consider `P = N/D` to be the probability of execz being false (skipping
|
|
// the then-block) The transformation is profitable if always executing the
|
|
// 'then' block is cheaper than executing sometimes 'then' and always
|
|
// executing s_cbranch_execz:
|
|
// * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
|
|
// * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
|
|
// * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
|
|
// BranchNotTakenCost
|
|
uint64_t Numerator = BranchProb.getNumerator();
|
|
uint64_t Denominator = BranchProb.getDenominator();
|
|
return (Denominator - Numerator) * ThenCyclesCost <=
|
|
((Denominator - Numerator) * BranchTakenCost +
|
|
Numerator * BranchNotTakenCost);
|
|
}
|
|
};
|
|
|
|
bool SIPreEmitPeephole::mustRetainExeczBranch(
|
|
const MachineInstr &Branch, const MachineBasicBlock &From,
|
|
const MachineBasicBlock &To) const {
|
|
assert(is_contained(Branch.getParent()->successors(), &From));
|
|
BranchWeightCostModel CostModel{*TII, Branch, From};
|
|
|
|
const MachineFunction *MF = From.getParent();
|
|
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
|
|
MBBI != End && MBBI != ToI; ++MBBI) {
|
|
const MachineBasicBlock &MBB = *MBBI;
|
|
|
|
for (const MachineInstr &MI : MBB) {
|
|
// When a uniform loop is inside non-uniform control flow, the branch
|
|
// leaving the loop might never be taken when EXEC = 0.
|
|
// Hence we should retain cbranch out of the loop lest it become infinite.
|
|
if (MI.isConditionalBranch())
|
|
return true;
|
|
|
|
if (MI.isUnconditionalBranch() &&
|
|
TII->getBranchDestBlock(MI) != MBB.getNextNode())
|
|
return true;
|
|
|
|
if (MI.isMetaInstruction())
|
|
continue;
|
|
|
|
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
|
|
return true;
|
|
|
|
if (!CostModel.isProfitable(MI))
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
} // namespace
|
|
|
|
// Returns true if the skip branch instruction is removed.
|
|
/// Tries to delete the execz skip branch \p MI that terminates \p SrcMBB.
/// Nothing is done without an instruction scheduling model, when the block's
/// branches cannot be analyzed, for non-forward branches, or when
/// mustRetainExeczBranch() asks to keep the branch. Returns true if the
/// branch was removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {
  // The profitability check relies on instruction latencies.
  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;
  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  const bool IsForwardBranch = SrcMBB.getNumber() < TrueMBB->getNumber();
  if (!IsForwardBranch)
    return false;

  // Consider only when it is legal and profitable
  const bool MustKeep = mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB);
  if (MustKeep)
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}
|
|
|
|
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
|
|
unsigned OpCode = MI.getOpcode();
|
|
Register DstReg = MI.getOperand(0).getReg();
|
|
// Only the first register in the register pair needs to be checked due to the
|
|
// unpacking order. Packed instructions are unpacked such that the lower 32
|
|
// bits (i.e., the first register in the pair) are written first. This can
|
|
// introduce dependencies if the first register is written in one instruction
|
|
// and then read as part of the higher 32 bits in the subsequent instruction.
|
|
// Such scenarios can arise due to specific combinations of op_sel and
|
|
// op_sel_hi modifiers.
|
|
Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
|
|
|
|
const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
|
|
if (Src0MO && Src0MO->isReg()) {
|
|
Register SrcReg0 = Src0MO->getReg();
|
|
unsigned Src0Mods =
|
|
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
|
|
Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
|
|
? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
|
|
: TRI->getSubReg(SrcReg0, AMDGPU::sub0);
|
|
// Check if the register selected by op_sel_hi is the same as the first
|
|
// register in the destination register pair.
|
|
if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
|
|
return true;
|
|
}
|
|
|
|
const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
|
|
if (Src1MO && Src1MO->isReg()) {
|
|
Register SrcReg1 = Src1MO->getReg();
|
|
unsigned Src1Mods =
|
|
TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
|
|
Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
|
|
? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
|
|
: TRI->getSubReg(SrcReg1, AMDGPU::sub0);
|
|
if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
|
|
return true;
|
|
}
|
|
|
|
// Applicable for packed instructions with 3 source operands, such as
|
|
// V_PK_FMA.
|
|
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
|
|
const MachineOperand *Src2MO =
|
|
TII->getNamedOperand(MI, AMDGPU::OpName::src2);
|
|
if (Src2MO && Src2MO->isReg()) {
|
|
Register SrcReg2 = Src2MO->getReg();
|
|
unsigned Src2Mods =
|
|
TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
|
|
Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
|
|
? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
|
|
: TRI->getSubReg(SrcReg2, AMDGPU::sub0);
|
|
if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
/// Maps a packed F32 opcode to its unpacked VOP3 counterpart. Returns
/// std::numeric_limits<uint16_t>::max() for every opcode that is not
/// supported for unpacking.
uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
  // Use 64 bit encoding to allow use of VOP3 instructions.
  // VOP3 e64 instructions allow source modifiers
  // e32 instructions don't allow source modifiers.
  switch (I.getOpcode()) {
  case AMDGPU::V_PK_ADD_F32:
    return AMDGPU::V_ADD_F32_e64;
  case AMDGPU::V_PK_MUL_F32:
    return AMDGPU::V_MUL_F32_e64;
  case AMDGPU::V_PK_FMA_F32:
    return AMDGPU::V_FMA_F32_e64;
  default:
    break;
  }
  // Sentinel meaning "not unpackable".
  return std::numeric_limits<uint16_t>::max();
}
|
|
|
|
/// Appends one source of the packed instruction to the unpacked instruction
/// \p NewMI: first the translated source-modifier immediate, then the operand
/// itself. \p SrcMods are the packed modifiers of that source, \p IsHiBits
/// selects whether the high or the low half is being built, and \p SrcMO is
/// the packed source operand.
void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
                                          unsigned SrcMods, bool IsHiBits,
                                          const MachineOperand &SrcMO) {
  // Packed instructions (VOP3P) do not support ABS. Hence, no checks are done
  // for ABS modifiers.
  // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
  // lane.
  // NEG_HI shares the same bit position with ABS. But packed instructions do
  // not support ABS. Therefore, NEG_HI must be translated to NEG source
  // modifier for the higher 32 bits. Unpacked VOP3 instructions support
  // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
  // NEG modifier if present in the packed instruction.
  const unsigned NegBit = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
  const unsigned NewSrcMods =
      (SrcMods & NegBit) ? static_cast<unsigned>(SISrcMods::NEG) : 0u;

  // Src modifiers. Only negative modifiers are added if needed. Unpacked
  // operations do not have op_sel, therefore it must be handled explicitly as
  // done below.
  NewMI.addImm(NewSrcMods);

  // Immediates are forwarded unchanged.
  if (SrcMO.isImm()) {
    NewMI.addImm(SrcMO.getImm());
    return;
  }

  // If op_sel == 0, select register 0 of reg:sub0_sub1.
  const unsigned OpSelBit =
      IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
  const unsigned SubIdx =
      (SrcMods & OpSelBit) ? AMDGPU::sub1 : AMDGPU::sub0;
  const Register UnpackedSrcReg = TRI->getSubReg(SrcMO.getReg(), SubIdx);

  MachineOperand UnpackedSrcMO =
      MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
  if (SrcMO.isKill()) {
    // For each unpacked instruction, mark its source registers as killed if the
    // corresponding source register in the original packed instruction was
    // marked as killed.
    //
    // Exception:
    // If the op_sel and op_sel_hi modifiers require both unpacked instructions
    // to use the same register (e.g., due to overlapping access to low/high
    // bits of the same packed register), then only the *second* (latter)
    // instruction should mark the register as killed. This is because the
    // second instruction handles the higher bits and is effectively the last
    // user of the full register pair.
    const bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
    const bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
    UnpackedSrcMO.setIsKill(IsHiBits || OpSel != OpSelHi);
  }
  NewMI.add(UnpackedSrcMO);
}
|
|
|
|
/// Scans the instructions that follow the MFMA \p BeginMI in its block and
/// inserts into \p InstrsToUnpack every packed instruction that is suitable
/// for unpacking.
///
/// \param BeginMI        The MFMA; scanning starts at the next instruction.
/// \param InstrsToUnpack Receives the selected packed instructions.
/// \param NumMFMACycles  Cycle count of the MFMA, as computed by the caller
///                       from the scheduling model. Scanning stops once the
///                       accumulated cycles of the following instructions
///                       reach NumMFMACycles - 1.
void SIPreEmitPeephole::collectUnpackingCandidates(
    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
    uint16_t NumMFMACycles) {
  auto *BB = BeginMI.getParent();
  auto E = BB->end();
  int TotalCyclesBetweenCandidates = 0;
  // Bind by reference: getSchedModel() hands out a reference, and only const
  // queries are made here, so copying the whole model per call is wasted work.
  const TargetSchedModel &SchedModel = TII->getSchedModel();
  Register MFMADef = BeginMI.getOperand(0).getReg();

  for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
    MachineInstr &Instr = *I;
    if (Instr.isMetaInstruction())
      continue;

    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
    bool IsUnpackable =
        UnpackedOpCode != std::numeric_limits<uint16_t>::max();

    // NOTE(review): the last clause only fires when an instruction modifies
    // both the MODE register and EXEC. Confirm `&&` (rather than `||`) is
    // intended.
    if ((Instr.isTerminator()) ||
        (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
        (SIInstrInfo::modifiesModeRegister(Instr) &&
         Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
      return;

    const MCSchedClassDesc *InstrSchedClassDesc =
        SchedModel.resolveSchedClass(&Instr);
    uint16_t Latency =
        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
    TotalCyclesBetweenCandidates += Latency;

    if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
      return;
    // Identify register dependencies between those used by the MFMA
    // instruction and the following packed instructions. Also checks for
    // transitive dependencies between the MFMA def and candidate instruction
    // def and uses. Conservatively ensures that we do not incorrectly
    // read/write registers.
    for (const MachineOperand &InstrMO : Instr.operands()) {
      if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
        continue;
      if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
        return;
    }
    if (!IsUnpackable)
      continue;

    if (canUnpackingClobberRegister(Instr))
      return;
    // If it's a packed instruction, adjust latency: remove the packed
    // latency, add latency of two unpacked instructions (currently estimated
    // as 2 cycles).
    TotalCyclesBetweenCandidates -= Latency;
    // TODO: improve latency handling based on instruction modeling.
    TotalCyclesBetweenCandidates += 2;
    // Subtract 1 to account for MFMA issue latency.
    if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
      InstrsToUnpack.insert(&Instr);
  }
}
|
|
|
|
/// Replaces the packed F32 instruction \p I with two unpacked instructions,
/// one for the low and one for the high 32 bits, inserted before \p I in that
/// order. \p I is erased afterwards.
///
/// The MI flags of \p I are copied to both new instructions. The undef flag
/// of the packed destination is propagated to the low destination, and its
/// renamable flag to both destinations.
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
  const MachineOperand &DstOp = I.getOperand(0);

  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
  assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
         "Unsupported Opcode");

  // The destination operands must be bound by reference: the flag updates
  // below have to land on the operands owned by the new instructions, not on
  // local copies that are discarded at the end of the function.
  MachineInstrBuilder Op0LOp1L =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
  MachineOperand &LoDstOp = Op0LOp1L->getOperand(0);

  LoDstOp.setIsUndef(DstOp.isUndef());

  MachineInstrBuilder Op0HOp1H =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
  MachineOperand &HiDstOp = Op0HOp1H->getOperand(0);

  uint32_t IFlags = I.getFlags();
  Op0LOp1L->setFlags(IFlags);
  Op0HOp1H->setFlags(IFlags);
  LoDstOp.setIsRenamable(DstOp.isRenamable());
  HiDstOp.setIsRenamable(DstOp.isRenamable());

  // DstOp refers into I, so I may only be erased once all reads are done.
  I.eraseFromParent();
}
|
|
|
|
/// Builds, immediately before \p I, the unpacked instruction \p UnpackedOpcode
/// that computes one 32-bit half of the packed instruction \p I.
///
/// \param IsHiBits Selects the high half (sub1 of the destination) when true,
///                 the low half (sub0) otherwise.
/// \returns The builder of the new instruction. Operand order is: vdst, then
///          a (modifiers, operand) pair per source, then clamp and omod.
MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
                                                        uint16_t UnpackedOpcode,
                                                        bool IsHiBits) {
  const unsigned DstSubIdx = IsHiBits ? AMDGPU::sub1 : AMDGPU::sub0;
  const Register UnpackedDstReg =
      TRI->getSubReg(I.getOperand(0).getReg(), DstSubIdx);
  const int64_t ClampVal =
      TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();

  MachineInstrBuilder NewMI =
      BuildMI(*I.getParent(), I, I.getDebugLoc(), TII->get(UnpackedOpcode));
  NewMI.addDef(UnpackedDstReg); // vdst

  // Appends one packed source together with its translated modifiers.
  auto AppendSource = [&](auto SrcName, auto ModsName) {
    const MachineOperand *SrcMO = TII->getNamedOperand(I, SrcName);
    const unsigned Mods = TII->getNamedOperand(I, ModsName)->getImm();
    addOperandAndMods(NewMI, Mods, IsHiBits, *SrcMO);
  };

  AppendSource(AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers);
  AppendSource(AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers);
  // Only three-source packed instructions carry a src2.
  if (AMDGPU::hasNamedOperand(I.getOpcode(), AMDGPU::OpName::src2))
    AppendSource(AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers);

  NewMI.addImm(ClampVal); // clamp
  // Packed instructions do not support output modifiers. safe to assign them 0
  // for this use case
  NewMI.addImm(0); // omod
  return NewMI;
}
|
|
|
|
/// New pass manager entry point. Runs the peephole on \p MF. If it reports a
/// change, the default machine-function preserved set is returned. Otherwise
/// the block numbers of any cached (post-)dominator tree are updated and all
/// analyses are reported as preserved.
PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  // Only cached results are looked at; nothing is computed on demand.
  auto *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
  auto *MPDT = MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);

  const bool Changed = SIPreEmitPeephole().run(MF);
  if (Changed)
    return getMachineFunctionPassPreservedAnalyses();

  // SIPreEmitPeephole::run() renumbers the blocks even when it changes
  // nothing else.
  if (MDT)
    MDT->updateBlockNumbers();
  if (MPDT)
    MPDT->updateBlockNumbers();
  return PreservedAnalyses::all();
}
|
|
|
|
/// Runs every pre-emit peephole on \p MF:
///  1. per block, simplifies a leading S_CBRANCH_VCC[N]Z / S_CBRANCH_EXECZ
///     terminator;
///  2. per block, on subtargets with VGPR index mode, removes redundant
///     S_SET_GPR_IDX_ON instructions;
///  3. on subtargets with GFX940 instructions, unpacks packed F32
///     instructions that follow an MFMA.
///
/// \returns true if the function was modified by any of the steps.
bool SIPreEmitPeephole::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  // removeExeczBranch() compares block numbers to detect forward branches.
  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do expensive checks in the optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
    // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  // TODO: Fold this into previous block, if possible. Evaluate and handle any
  // side effects.

  // Perform the extra MF scans only for supported archs
  if (!ST.hasGFX940Insts())
    return Changed;

  // Only const queries are made, so bind the model once by reference instead
  // of copying it for every block.
  const TargetSchedModel &SchedModel = TII->getSchedModel();
  for (MachineBasicBlock &MBB : MF) {
    // Unpack packed instructions overlapped by MFMAs. This allows the
    // compiler to co-issue unpacked instructions with MFMA
    SetVector<MachineInstr *> InstrsToUnpack;
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (!SIInstrInfo::isMFMA(MI))
        continue;
      const MCSchedClassDesc *SchedClassDesc =
          SchedModel.resolveSchedClass(&MI);
      uint16_t NumMFMACycles =
          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
    }
    // Candidates are rewritten only after the scan so that collection never
    // walks over freshly inserted instructions.
    for (MachineInstr *MI : InstrsToUnpack)
      performF32Unpacking(*MI);
    // Unpacking replaces instructions, so it must be reported as a change.
    Changed |= !InstrsToUnpack.empty();
  }

  return Changed;
}
|