llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
Austin Kerbow 89503bda38
[AMDGPU] Add structural stall heuristic to scheduling strategies (#169617)
Implements a structural stall heuristic that considers both resource
hazards and latency constraints when selecting instructions. In coexec,
this changes the pending queue from a binary “not ready to issue”
distinction into part of a unified candidate comparison. Pending
instructions still identify structural stalls in the current cycle, but
they are now evaluated directly against available instructions by stall
cost, making the heuristics both more intuitive and more expressive.

- Add getStructuralStallCycles() to GCNSchedStrategy that computes the
number of cycles an instruction must wait due to:
  - Resource conflicts on unbuffered resources (from the SchedModel)
  - Sequence-dependent hazards (from GCNHazardRecognizer)

- Add getHazardWaitStates() to GCNHazardRecognizer that returns the
number
of wait states until all hazards for an instruction are resolved,
providing cycle-accurate hazard information for scheduling heuristics.
2026-03-23 11:33:43 -07:00

322 lines
12 KiB
C++

//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Coexecution-focused scheduling strategy for AMDGPU.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUCoExecSchedStrategy.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
#define DEBUG_TYPE "machine-scheduler"
namespace {
// Used to disable post-RA scheduling with function level granularity.
class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
public:
explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
: ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}
// Do nothing.
void schedule() override {}
};
} // namespace
static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
// pickOnlyChoice() releases pending instructions and checks for new hazards.
SUnit *OnlyChoice = Zone.pickOnlyChoice();
if (!Zone.Pending.empty())
return nullptr;
return OnlyChoice;
}
AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
const MachineSchedContext *C)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
// Use more accurate GCN pressure trackers.
UseGCNTrackers = true;
}
void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
unsigned NumRegionInstrs) {
GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
assert((PreRADirection == MISched::Unspecified ||
PreRADirection == MISched::TopDown) &&
"coexec scheduler only supports top-down scheduling");
RegionPolicy.OnlyTopDown = true;
RegionPolicy.OnlyBottomUp = false;
}
void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
// Coexecution scheduling strategy is only done top-down to support new
// resource balancing heuristics.
RegionPolicy.OnlyTopDown = true;
RegionPolicy.OnlyBottomUp = false;
GCNSchedStrategy::initialize(DAG);
}
SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
"coexec scheduler only supports top-down scheduling");
if (DAG->top() == DAG->bottom()) {
assert(Top.Available.empty() && Top.Pending.empty() &&
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
bool PickedPending = false;
SUnit *SU = nullptr;
do {
PickedPending = false;
SU = pickOnlyChoice(Top);
if (!SU) {
CandPolicy NoPolicy;
TopCand.reset(NoPolicy);
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
PickedPending, /*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
SU = TopCand.SU;
}
IsTopNode = true;
} while (SU->isScheduled);
if (PickedPending) {
unsigned ReadyCycle = SU->TopReadyCycle;
unsigned CurrentCycle = Top.getCurrCycle();
if (ReadyCycle > CurrentCycle)
Top.bumpCycle(ReadyCycle);
// checkHazard() does not expose the exact cycle where the hazard clears.
while (Top.checkHazard(SU))
Top.bumpCycle(Top.getCurrCycle() + 1);
Top.releasePending();
}
if (SU->isTopReady())
Top.removeReady(SU);
if (SU->isBottomReady())
Bot.removeReady(SU);
LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
<< *SU->getInstr());
assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
return SU;
}
void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker, SchedCandidate &Cand,
bool &PickedPending, bool IsBottomUp) {
assert(Zone.isTop() && "coexec scheduler only supports top boundary");
assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
PickedPending = false;
if (DAG->isTrackingPressure()) {
if (!useGCNTrackers()) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
} else {
SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
}
}
auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
for (SUnit *SU : Q) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
VGPRPressure, IsBottomUp);
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
tryCandidate(Cand, TryCand, ZoneArg);
if (TryCand.Reason != NoCand) {
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(Zone.DAG, SchedModel);
LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
PickedPending = FromPending;
Cand.setBest(TryCand);
} else {
printCandidateDecision(TryCand, Cand);
}
}
};
LLVM_DEBUG(dbgs() << "Available Q:\n");
EvaluateQueue(Zone.Available, /*FromPending=*/false);
LLVM_DEBUG(dbgs() << "Pending Q:\n");
EvaluateQueue(Zone.Pending, /*FromPending=*/true);
}
bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = FirstValid;
return true;
}
// Bias PhysReg Defs and copies to their uses and defined respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;
// Avoid exceeding the target's limit.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// We only compare a subset of features when comparing nodes between
// Top and Bottom boundary. Some properties are simply incomparable, in many
// other instances we should only override the other boundary if something
// is a clear good pick on one boundary. Skip heuristics that are more
// "tie-breaking" in nature.
bool SameBoundary = Zone != nullptr;
if (SameBoundary) {
// For loops that are acyclic path limited, aggressively schedule for
// latency. Within an single cycle, whenever CurrMOps > 0, allow normal
// heuristics to take precedence.
if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// Otherwise compare candidates by the stall they would introduce if
// scheduled in the current cycle.
if (tryEffectiveStall(Cand, TryCand, *Zone))
return TryCand.Reason != NoCand;
}
// Keep clustered nodes together to encourage downstream peephole
// optimizations which may reduce resource requirements.
//
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
bool CandIsClusterSucc =
isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
bool TryCandIsClusterSucc =
isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;
if (SameBoundary) {
// Weak edges are for clustering and other constraints.
if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
return TryCand.Reason != NoCand;
}
// Avoid increasing the max pressure of the entire region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
if (SameBoundary) {
// Avoid critical resource consumption and balance the schedule.
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
return TryCand.Reason != NoCand;
// Avoid serializing long latency dependence chains.
// For acyclic path limited loops, latency was already checked above.
if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
!Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// Fall through to original instruction order.
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
TryCand.Reason = NodeOrder;
return true;
}
}
return false;
}
bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary &Zone) const {
// Treat structural and latency stalls as a single scheduling cost for the
// current cycle.
struct StallCosts {
unsigned Ready = 0;
unsigned Structural = 0;
unsigned Latency = 0;
unsigned Effective = 0;
};
unsigned CurrCycle = Zone.getCurrCycle();
auto GetStallCosts = [&](SUnit *SU) {
unsigned ReadyCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
StallCosts Costs;
Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
Costs.Structural = getStructuralStallCycles(Zone, SU);
Costs.Latency = Zone.getLatencyStallCycles(SU);
Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency});
return Costs;
};
StallCosts TryCosts = GetStallCosts(TryCand.SU);
StallCosts CandCosts = GetStallCosts(Cand.SU);
LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
dbgs() << "Effective stalls: try=" << TryCosts.Effective
<< " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
<< ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
<< " (ready=" << CandCosts.Ready
<< ", struct=" << CandCosts.Structural
<< ", lat=" << CandCosts.Latency << ")\n";
});
return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand, Stall);
}
ScheduleDAGInstrs *
llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
<< C->MF->getName() << '\n');
return new GCNScheduleDAGMILive(
C, std::make_unique<AMDGPUCoExecSchedStrategy>(C));
}
ScheduleDAGInstrs *
llvm::createGCNNoopPostMachineScheduler(MachineSchedContext *C) {
LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
<< C->MF->getName() << '\n');
return new GCNNoopPostScheduleDAG(C);
}