//===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_wait_alu instructions to mitigate SGPR read hazards on GFX12.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUWaitSGPRHazards.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"

static cl::opt<bool> GlobalEnableSGPRHazardWaits(
    "amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden,
    cl::desc("Enable required s_wait_alu on SGPR hazards"));

static cl::opt<bool> GlobalCullSGPRHazardsOnFunctionBoundary(
    "amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden,
    cl::desc("Cull hazards on function boundaries"));

static cl::opt<bool>
    GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull",
                                   cl::init(false), cl::Hidden,
                                   cl::desc("Cull hazards on memory waits"));

static cl::opt<unsigned> GlobalCullSGPRHazardsMemWaitThreshold(
    "amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden,
    cl::desc("Number of tracked SGPRs before initiating hazard cull on memory "
             "wait"));

namespace {

class AMDGPUWaitSGPRHazards {
public:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const MachineRegisterInfo *MRI;
  unsigned DsNopCount;

  bool EnableSGPRHazardWaits;
  bool CullSGPRHazardsOnFunctionBoundary;
  bool CullSGPRHazardsAtMemWait;
  unsigned CullSGPRHazardsMemWaitThreshold;

  AMDGPUWaitSGPRHazards() {}

  // Return the numeric ID 0-127 for a given SGPR.
  static std::optional<unsigned> sgprNumber(Register Reg,
                                            const SIRegisterInfo &TRI) {
    switch (Reg) {
    case AMDGPU::M0:
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
      return {};
    default:
      break;
    }
    unsigned RegN = TRI.getHWRegIndex(Reg);
    if (RegN > 127)
      return {};
    return RegN;
  }

  static inline bool isVCC(Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
  }

  // Adjust global offsets for instructions bundled with S_GETPC_B64 after
  // insertion of a new instruction.
  static void updateGetPCBundle(MachineInstr *NewMI) {
    if (!NewMI->isBundled())
      return;

    // Find start of bundle.
    auto I = NewMI->getIterator();
    while (I->isBundledWithPred())
      I--;
    if (I->isBundle())
      I++;

    // Bail if this is not an S_GETPC bundle.
    if (I->getOpcode() != AMDGPU::S_GETPC_B64)
      return;

    // Update offsets of any references in the bundle.
    const unsigned NewBytes = 4;
    assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
           "Unexpected instruction insertion in bundle");
    auto NextMI = std::next(NewMI->getIterator());
    auto End = NewMI->getParent()->end();
    while (NextMI != End && NextMI->isBundledWithPred()) {
      for (auto &Operand : NextMI->operands()) {
        if (Operand.isGlobal())
          Operand.setOffset(Operand.getOffset() + NewBytes);
      }
      NextMI++;
    }
  }

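  // Hazard state tracked at a program point. Tracked records which SGPR
  // pairs have ever been read by a VALU; SALUHazards/VALUHazards record
  // SGPRs whose latest write has not yet been waited on. merge() is the
  // dataflow join used when propagating state across CFG edges; it returns
  // true when the state grows, which drives the fixed-point iteration in
  // run().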
  struct HazardState {
    static constexpr unsigned None = 0;
    static constexpr unsigned SALU = (1 << 0);
    static constexpr unsigned VALU = (1 << 1);

    std::bitset<64> Tracked;      // SGPR banks ever read by VALU
    std::bitset<128> SALUHazards; // SGPRs with uncommitted values from SALU
    std::bitset<128> VALUHazards; // SGPRs with uncommitted values from VALU
    unsigned VCCHazard = None;    // Source of current VCC writes
    bool ActiveFlat = false;      // Has unwaited flat instructions

    bool merge(const HazardState &RHS) {
      HazardState Orig(*this);
      *this |= RHS;
      return (*this != Orig);
    }

    bool operator==(const HazardState &RHS) const {
      return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards &&
             VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard &&
             ActiveFlat == RHS.ActiveFlat;
    }

    bool operator!=(const HazardState &RHS) const { return !(*this == RHS); }

    void operator|=(const HazardState &RHS) {
      Tracked |= RHS.Tracked;
      SALUHazards |= RHS.SALUHazards;
      VALUHazards |= RHS.VALUHazards;
      VCCHazard |= RHS.VCCHazard;
      ActiveFlat |= RHS.ActiveFlat;
    }
  };

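  // Hazard state at basic block boundaries: In is the state on entry to the
  // block (the join over all predecessors), Out is the state at the end of
  // the block as computed by runOnMachineBasicBlock.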
  struct BlockHazardState {
    HazardState In;
    HazardState Out;
  };

  DenseMap<const MachineBasicBlock *, BlockHazardState> BlockState;

  static constexpr unsigned WAVE32_NOPS = 4;
  static constexpr unsigned WAVE64_NOPS = 8;

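  // Insert a hazard cull: a run of DsNopCount DS_NOP instructions. The
  // analysis in runOnMachineBasicBlock clears the tracked SGPR set once it
  // sees that many consecutive DS_NOPs; wave64 needs twice as many as wave32.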
  void insertHazardCull(MachineBasicBlock &MBB,
                        MachineBasicBlock::instr_iterator &MI) {
    assert(!MI->isBundled());
    unsigned Count = DsNopCount;
    while (Count--)
      BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP));
  }

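  // Combine two S_WAITCNT_DEPCTR masks. Smaller DEPCTR field values impose
  // stricter waits (0 waits for all outstanding instructions), so taking the
  // per-field minimum produces a mask at least as strict as both inputs.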
  unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
    unsigned Mask = 0xffff;
    Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaVcc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaVcc(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVmVsrc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVmVsrc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVmVsrc(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaSdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaSdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaVdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaVdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldHoldCnt(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1),
                       AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaSsrc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2)));
    return Mask;
  }

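  // Fold the wait requirements in Mask into an immediately preceding
  // S_WAITCNT_DEPCTR, if there is one, instead of emitting a second wait.
  // Returns true on success.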
  bool mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator &MI,
                                unsigned Mask) {
    auto MBB = MI->getParent();
    if (MI == MBB->instr_begin())
      return false;

    auto It = prev_nodbg(MI, MBB->instr_begin());
    if (It->getOpcode() != AMDGPU::S_WAITCNT_DEPCTR)
      return false;

    It->getOperand(0).setImm(mergeMasks(Mask, It->getOperand(0).getImm()));
    return true;
  }

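  // Process a single basic block. With Emit == false only the block's
  // outgoing hazard state is computed and the return value reports whether
  // it changed; with Emit == true the required s_wait_alu (S_WAITCNT_DEPCTR)
  // instructions and DS_NOP culls are inserted and the return value reports
  // whether anything was emitted.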
  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };
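    // WA_VALU, WA_SALU and WA_VCC select which DEPCTR fields (va_sdst,
    // sa_sdst and va_vcc respectively) must be zeroed in the emitted wait.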

    HazardState State = BlockState[&MBB].In;
    SmallSet<Register, 8> SeenRegs;
    bool Emitted = false;
    unsigned DsNops = 0;

    for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(),
                                           E = MBB.instr_end();
         MI != E; ++MI) {
      if (MI->isMetaInstruction())
        continue;

      // Clear tracked SGPRs if sufficient DS_NOPs occur
      if (MI->getOpcode() == AMDGPU::DS_NOP) {
        if (++DsNops >= DsNopCount)
          State.Tracked.reset();
        continue;
      }
      DsNops = 0;

      // Snoop FLAT instructions to avoid adding culls before scratch/lds loads.
      // Culls could be disproportionate in cost to load time.
      if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI))
        State.ActiveFlat = true;

      // SMEM or VMEM clears hazards
      if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSMRD(*MI)) {
        State.VCCHazard = HazardState::None;
        State.SALUHazards.reset();
        State.VALUHazards.reset();
        continue;
      }

      // Existing S_WAITALU can clear hazards
      if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned int Mask = MI->getOperand(0).getImm();
        if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0)
          State.VCCHazard &= ~HazardState::VALU;
        if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) {
          State.SALUHazards.reset();
          State.VCCHazard &= ~HazardState::SALU;
        }
        if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0)
          State.VALUHazards.reset();
        continue;
      }

      // Snoop counter waits to insert culls
      if (CullSGPRHazardsAtMemWait &&
          (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
          (MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) &&
          (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
        if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
          State.ActiveFlat = false;
        } else {
          State.Tracked.reset();
          if (Emit)
            insertHazardCull(MBB, MI);
          continue;
        }
      }

      // Process only VALUs and SALUs
      bool IsVALU = SIInstrInfo::isVALU(*MI);
      bool IsSALU = SIInstrInfo::isSALU(*MI);
      if (!IsVALU && !IsSALU)
        continue;

      unsigned Wait = 0;

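      // Examine a single SGPR operand: start tracking registers newly read by
      // VALUs, accumulate the waits required by uses of hazardous registers,
      // and record new hazards for defs.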
      auto processOperand = [&](const MachineOperand &Op, bool IsUse) {
        if (!Op.isReg())
          return;
        Register Reg = Op.getReg();
        assert(!Op.getSubReg());
        if (!TRI->isSGPRReg(*MRI, Reg))
          return;

        // Only visit each register once
        if (!SeenRegs.insert(Reg).second)
          return;

        auto RegNumber = sgprNumber(Reg, *TRI);
        if (!RegNumber)
          return;

        // Track SGPRs by pair -- numeric ID of a 64b SGPR pair.
        // i.e. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
        unsigned RegN = *RegNumber;
        unsigned PairN = (RegN >> 1) & 0x3f;

        // Read/write of an untracked register is safe, but any new reads
        // must be recorded.
        if (!State.Tracked[PairN]) {
          if (IsVALU && IsUse)
            State.Tracked.set(PairN);
          return;
        }

        uint8_t SGPRCount =
            AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32;

        if (IsUse) {
          // SALU reading SGPR clears VALU hazards
          if (IsSALU) {
            if (isVCC(Reg)) {
              if (State.VCCHazard & HazardState::VALU)
                State.VCCHazard = HazardState::None;
            } else {
              State.VALUHazards.reset();
            }
          }
          // Compute required waits
          for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
            Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
            Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
          }
          if (isVCC(Reg) && State.VCCHazard) {
            // Note: it's possible for both SALU and VALU to exist if VCC
            // was updated differently by merged predecessors.
            if (State.VCCHazard & HazardState::SALU)
              Wait |= WA_SALU;
            if (State.VCCHazard & HazardState::VALU)
              Wait |= WA_VCC;
          }
        } else {
          // Update hazards
          if (isVCC(Reg)) {
            State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
          } else {
            for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
              if (IsSALU)
                State.SALUHazards.set(RegN + RegIdx);
              else
                State.VALUHazards.set(RegN + RegIdx);
            }
          }
        }
      };

      const bool IsSetPC =
          (MI->isCall() || MI->isReturn() || MI->isIndirectBranch()) &&
          MI->getOpcode() != AMDGPU::S_ENDPGM &&
          MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED;

      // Only consider implicit VCC specified by instruction descriptor.
      const bool HasImplicitVCC =
          llvm::any_of(MI->getDesc().implicit_uses(),
                       [](MCPhysReg Reg) { return isVCC(Reg); }) ||
          llvm::any_of(MI->getDesc().implicit_defs(),
                       [](MCPhysReg Reg) { return isVCC(Reg); });

      if (IsSetPC) {
        // All SGPR writes before a call/return must be flushed as the
        // callee/caller will not see the hazard chain.
        if (State.VCCHazard & HazardState::VALU)
          Wait |= WA_VCC;
        if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU))
          Wait |= WA_SALU;
        if (State.VALUHazards.any())
          Wait |= WA_VALU;
        if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) {
          State.Tracked.reset();
          if (Emit)
            insertHazardCull(MBB, MI);
        }
      } else {
        // Process uses to determine required wait.
        SeenRegs.clear();
        for (const MachineOperand &Op : MI->all_uses()) {
          if (Op.isImplicit() &&
              (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
            continue;
          processOperand(Op, true);
        }
      }

      // Apply wait
      if (Wait) {
        unsigned Mask = 0xffff;
        if (Wait & WA_VCC) {
          State.VCCHazard &= ~HazardState::VALU;
          Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
        }
        if (Wait & WA_SALU) {
          State.SALUHazards.reset();
          State.VCCHazard &= ~HazardState::SALU;
          Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Mask, 0);
        }
        if (Wait & WA_VALU) {
          State.VALUHazards.reset();
          Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0);
        }
        if (Emit) {
          if (!mergeConsecutiveWaitAlus(MI, Mask)) {
            auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(),
                                 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
                             .addImm(Mask);
            updateGetPCBundle(NewMI);
          }
          Emitted = true;
        }
      }

      // On return from a call the SGPR state is unknown, so treat all SGPRs
      // as potential hazards.
      if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary)
        State.Tracked.set();

      // Update hazards based on defs.
      SeenRegs.clear();
      for (const MachineOperand &Op : MI->all_defs()) {
        if (Op.isImplicit() &&
            (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
          continue;
        processOperand(Op, false);
      }
    }

    BlockHazardState &BS = BlockState[&MBB];
    bool Changed = State != BS.Out;
    if (Emit) {
      assert(!Changed && "Hazard state should not change on emit pass");
      return Emitted;
    }
    if (Changed)
      BS.Out = State;
    return Changed;
  }

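  // Run the pass on a machine function: iterate the per-block analysis to a
  // fixed point, then make a final pass over each block to emit the waits.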
  bool run(MachineFunction &MF) {
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasVALUReadSGPRHazard())
      return false;

    // Parse settings
    EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits;
    CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary;
    CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait;
    CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold;

    if (!GlobalEnableSGPRHazardWaits.getNumOccurrences())
      EnableSGPRHazardWaits = MF.getFunction().getFnAttributeAsParsedInteger(
          "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
    if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences())
      CullSGPRHazardsOnFunctionBoundary =
          MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull");
    if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences())
      CullSGPRHazardsAtMemWait =
          MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull");
    if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences())
      CullSGPRHazardsMemWaitThreshold =
          MF.getFunction().getFnAttributeAsParsedInteger(
              "amdgpu-sgpr-hazard-mem-wait-cull-threshold",
              CullSGPRHazardsMemWaitThreshold);

    // Bail if disabled
    if (!EnableSGPRHazardWaits)
      return false;

    TII = ST.getInstrInfo();
    TRI = ST.getRegisterInfo();
    MRI = &MF.getRegInfo();
    DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;

    auto CallingConv = MF.getFunction().getCallingConv();
    if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
        !CullSGPRHazardsOnFunctionBoundary) {
      // Callee must consider all SGPRs as tracked.
      LLVM_DEBUG(dbgs() << "Is called function, track all SGPRs.\n");
      MachineBasicBlock &EntryBlock = MF.front();
      BlockState[&EntryBlock].In.Tracked.set();
    }

    // Calculate the hazard state for each basic block.
    // Iterate until a fixed point is reached.
    // Fixed point is guaranteed as the merge function only ever increases
    // the hazard set, and all backedges will cause a merge.
    //
    // Note: we have to take care of the entry block as this technically
    // has an edge from outside the function. Failure to treat this as
    // a merge could prevent the fixed point being reached.
    SetVector<MachineBasicBlock *> Worklist;
    for (auto &MBB : reverse(MF))
      Worklist.insert(&MBB);
    while (!Worklist.empty()) {
      auto &MBB = *Worklist.pop_back_val();
      bool Changed = runOnMachineBasicBlock(MBB, false);
      if (Changed) {
        // Note: take a copy of the state here in case it is reallocated by
        // the map.
        HazardState NewState = BlockState[&MBB].Out;
        // Propagate to all successor blocks
        for (auto Succ : MBB.successors()) {
          // We only need to merge hazards at CFG merge points.
          auto &SuccState = BlockState[Succ];
          if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
            if (SuccState.In != NewState) {
              SuccState.In = NewState;
              Worklist.insert(Succ);
            }
          } else if (SuccState.In.merge(NewState)) {
            Worklist.insert(Succ);
          }
        }
      }
    }

    LLVM_DEBUG(dbgs() << "Emit s_wait_alu instructions\n");

    // Final pass to emit wait instructions.
    bool Changed = false;
    for (auto &MBB : MF)
      Changed |= runOnMachineBasicBlock(MBB, true);

    BlockState.clear();
    return Changed;
  }
};

class AMDGPUWaitSGPRHazardsLegacy : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    return AMDGPUWaitSGPRHazards().run(MF);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // namespace

char AMDGPUWaitSGPRHazardsLegacy::ID = 0;

char &llvm::AMDGPUWaitSGPRHazardsLegacyID = AMDGPUWaitSGPRHazardsLegacy::ID;

INITIALIZE_PASS(AMDGPUWaitSGPRHazardsLegacy, DEBUG_TYPE,
                "AMDGPU Insert waits for SGPR read hazards", false, false)

PreservedAnalyses
AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &MFAM) {
  if (AMDGPUWaitSGPRHazards().run(MF))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}