[AMDGPU] Add ML-oriented coexec scheduler selection and queue handling (#169616)

This patch adds the initial coexec scheduler scaffold for machine
learning workloads on gfx1250.

It introduces function and module-level controls for selecting the
AMDGPU preRA and postRA schedulers, including an `amdgpu-workload-type`
module flag that maps ML workloads to coexec preRA scheduling and a nop
postRA scheduler by default.

It also updates the coexec scheduler to use a simplified top-down
candidate selection path that considers both available and pending
queues through a single flow, setting up follow-on heuristic work.
This commit is contained in:
Austin Kerbow 2026-03-23 09:30:01 -07:00 committed by GitHub
parent d69c670934
commit 3e4efe3ed4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 555 additions and 14 deletions

View File

@ -0,0 +1,283 @@
//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Coexecution-focused scheduling strategy for AMDGPU.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUCoExecSchedStrategy.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
#define DEBUG_TYPE "machine-scheduler"
namespace {
// Used to disable post-RA scheduling with function level granularity.
class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
public:
explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
: ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}
// Do nothing.
void schedule() override {}
};
} // namespace
static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
// pickOnlyChoice() releases pending instructions and checks for new hazards.
SUnit *OnlyChoice = Zone.pickOnlyChoice();
if (!Zone.Pending.empty())
return nullptr;
return OnlyChoice;
}
AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
const MachineSchedContext *C)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
// Use more accurate GCN pressure trackers.
UseGCNTrackers = true;
}
void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
unsigned NumRegionInstrs) {
GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
assert((PreRADirection == MISched::Unspecified ||
PreRADirection == MISched::TopDown) &&
"coexec scheduler only supports top-down scheduling");
RegionPolicy.OnlyTopDown = true;
RegionPolicy.OnlyBottomUp = false;
}
void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
// Coexecution scheduling strategy is only done top-down to support new
// resource balancing heuristics.
RegionPolicy.OnlyTopDown = true;
RegionPolicy.OnlyBottomUp = false;
GCNSchedStrategy::initialize(DAG);
}
SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
"coexec scheduler only supports top-down scheduling");
if (DAG->top() == DAG->bottom()) {
assert(Top.Available.empty() && Top.Pending.empty() &&
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
bool PickedPending = false;
SUnit *SU = nullptr;
do {
PickedPending = false;
SU = pickOnlyChoice(Top);
if (!SU) {
CandPolicy NoPolicy;
TopCand.reset(NoPolicy);
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
PickedPending, /*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
SU = TopCand.SU;
}
IsTopNode = true;
} while (SU->isScheduled);
if (PickedPending) {
unsigned ReadyCycle = SU->TopReadyCycle;
unsigned CurrentCycle = Top.getCurrCycle();
if (ReadyCycle > CurrentCycle)
Top.bumpCycle(ReadyCycle);
// checkHazard() does not expose the exact cycle where the hazard clears.
while (Top.checkHazard(SU))
Top.bumpCycle(Top.getCurrCycle() + 1);
Top.releasePending();
}
if (SU->isTopReady())
Top.removeReady(SU);
if (SU->isBottomReady())
Bot.removeReady(SU);
LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
<< *SU->getInstr());
assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
return SU;
}
void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker, SchedCandidate &Cand,
bool &PickedPending, bool IsBottomUp) {
assert(Zone.isTop() && "coexec scheduler only supports top boundary");
assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
PickedPending = false;
if (DAG->isTrackingPressure()) {
if (!useGCNTrackers()) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
} else {
SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
}
}
auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
for (SUnit *SU : Q) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
VGPRPressure, IsBottomUp);
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
tryCandidate(Cand, TryCand, ZoneArg);
if (TryCand.Reason != NoCand) {
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(Zone.DAG, SchedModel);
LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
PickedPending = FromPending;
Cand.setBest(TryCand);
} else {
printCandidateDecision(TryCand, Cand);
}
}
};
LLVM_DEBUG(dbgs() << "Available Q:\n");
EvaluateQueue(Zone.Available, /*FromPending=*/false);
LLVM_DEBUG(dbgs() << "Pending Q:\n");
EvaluateQueue(Zone.Pending, /*FromPending=*/true);
}
bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = FirstValid;
return true;
}
// Bias PhysReg Defs and copies to their uses and defined respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;
// Avoid exceeding the target's limit.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// We only compare a subset of features when comparing nodes between
// Top and Bottom boundary. Some properties are simply incomparable, in many
// other instances we should only override the other boundary if something
// is a clear good pick on one boundary. Skip heuristics that are more
// "tie-breaking" in nature.
bool SameBoundary = Zone != nullptr;
if (SameBoundary) {
// For loops that are acyclic path limited, aggressively schedule for
// latency. Within an single cycle, whenever CurrMOps > 0, allow normal
// heuristics to take precedence.
if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return TryCand.Reason != NoCand;
}
// Keep clustered nodes together to encourage downstream peephole
// optimizations which may reduce resource requirements.
//
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
bool CandIsClusterSucc =
isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
bool TryCandIsClusterSucc =
isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;
if (SameBoundary) {
// Weak edges are for clustering and other constraints.
if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
return TryCand.Reason != NoCand;
}
// Avoid increasing the max pressure of the entire region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
if (SameBoundary) {
// Avoid critical resource consumption and balance the schedule.
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
return TryCand.Reason != NoCand;
// Avoid serializing long latency dependence chains.
// For acyclic path limited loops, latency was already checked above.
if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
!Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// Fall through to original instruction order.
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
TryCand.Reason = NodeOrder;
return true;
}
}
return false;
}
ScheduleDAGInstrs *
llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
<< C->MF->getName() << '\n');
return new GCNScheduleDAGMILive(
C, std::make_unique<AMDGPUCoExecSchedStrategy>(C));
}
ScheduleDAGInstrs *
llvm::createGCNNoopPostMachineScheduler(MachineSchedContext *C) {
LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
<< C->MF->getName() << '\n');
return new GCNNoopPostScheduleDAG(C);
}

View File

@ -0,0 +1,46 @@
//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Coexecution-focused scheduling strategy for AMDGPU.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
#include "GCNSchedStrategy.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
protected:
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone) const override;
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand, bool &PickedPending,
bool IsBottomUp);
public:
AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);
void initPolicy(MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
unsigned NumRegionInstrs) override;
void initialize(ScheduleDAGMI *DAG) override;
SUnit *pickNode(bool &IsTopNode) override;
};
ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
ScheduleDAGInstrs *createGCNNoopPostMachineScheduler(MachineSchedContext *C);
} // End namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H

View File

@ -18,6 +18,7 @@
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUBarrierLatency.h"
#include "AMDGPUCoExecSchedStrategy.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
@ -89,6 +90,7 @@
#include "llvm/CodeGen/PostRAHazardRecognizer.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
@ -574,6 +576,38 @@ static cl::opt<std::string>
cl::desc("Select custom AMDGPU scheduling strategy."),
cl::Hidden, cl::init(""));
// Scheduler selection is consulted both when creating the scheduler and from
// overrideSchedPolicy(), so keep the attribute and global command line handling
// in one helper.
StringRef llvm::AMDGPU::getSchedStrategy(const Function &F) {
Attribute SchedStrategyAttr = F.getFnAttribute("amdgpu-sched-strategy");
if (SchedStrategyAttr.isValid())
return SchedStrategyAttr.getValueAsString();
if (!AMDGPUSchedStrategy.empty())
return AMDGPUSchedStrategy;
return "";
}
static void
diagnoseUnsupportedCoExecSchedulerSelection(const Function &F,
const GCNSubtarget &ST) {
if (ST.hasGFX1250Insts())
return;
F.getContext().diagnose(DiagnosticInfoUnsupported(
F, "'amdgpu-sched-strategy'='coexec' is only supported for gfx1250",
DiagnosticLocation(), DS_Warning));
}
static bool useNoopPostScheduler(const Function &F) {
Attribute PostSchedStrategyAttr =
F.getFnAttribute("amdgpu-post-sched-strategy");
return PostSchedStrategyAttr.isValid() &&
PostSchedStrategyAttr.getValueAsString() == "nop";
}
static cl::opt<bool> EnableRewritePartialRegUses(
"amdgpu-enable-rewrite-partial-reg-uses",
cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
@ -1244,11 +1278,7 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);
Attribute SchedStrategyAttr =
C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
StringRef SchedStrategy = SchedStrategyAttr.isValid()
? SchedStrategyAttr.getValueAsString()
: AMDGPUSchedStrategy;
StringRef SchedStrategy = AMDGPU::getSchedStrategy(C->MF->getFunction());
if (SchedStrategy == "max-ilp")
return createGCNMaxILPMachineScheduler(C);
@ -1265,11 +1295,19 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
if (SchedStrategy == "iterative-maxocc")
return createIterativeGCNMaxOccupancyMachineScheduler(C);
if (SchedStrategy == "coexec") {
diagnoseUnsupportedCoExecSchedulerSelection(C->MF->getFunction(), ST);
return createGCNCoExecMachineScheduler(C);
}
return createGCNMaxOccupancyMachineScheduler(C);
}
ScheduleDAGInstrs *
GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
if (useNoopPostScheduler(C->MF->getFunction()))
return createGCNNoopPostMachineScheduler(C);
ScheduleDAGMI *DAG =
new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(C),
/*RemoveKillFlags=*/true);

View File

@ -27,6 +27,10 @@ namespace llvm {
// AMDGPU Target Machine (R600+)
//===----------------------------------------------------------------------===//
namespace AMDGPU {
StringRef getSchedStrategy(const Function &F);
}
class AMDGPUTargetMachine : public CodeGenTargetMachineImpl {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;

View File

@ -86,6 +86,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUMemoryUtils.cpp
AMDGPUCoExecSchedStrategy.cpp
AMDGPUIGroupLP.cpp
AMDGPULowerVGPREncoding.cpp
AMDGPUMCResourceInfo.cpp

View File

@ -108,6 +108,8 @@ const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
: GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
if (GCNTrackers.getNumOccurrences() > 0)
GCNTrackersOverride = GCNTrackers;
}
void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
@ -196,7 +198,7 @@ static bool canUsePressureDiffs(const SUnit &SU) {
return true;
}
static void getRegisterPressures(
void GCNSchedStrategy::getRegisterPressures(
bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
@ -204,7 +206,7 @@ static void getRegisterPressures(
// getDownwardPressure() and getUpwardPressure() make temporary changes to
// the tracker, so we need to pass those function a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
if (!GCNTrackers) {
if (!useGCNTrackers()) {
AtTop
? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
: TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
@ -256,7 +258,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
//
// In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
// PressureDiffs.
if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
if (AtTop || !canUsePressureDiffs(*SU) || useGCNTrackers()) {
getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
DownwardTracker, UpwardTracker, DAG, SRI);
} else {
@ -400,7 +402,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
unsigned VGPRPressure = 0;
IsPending = false;
if (DAG->isTrackingPressure()) {
if (!GCNTrackers) {
if (!useGCNTrackers()) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
} else {
@ -623,7 +625,7 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
}
void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (GCNTrackers) {
if (useGCNTrackers()) {
MachineInstr *MI = SU->getInstr();
IsTopNode ? (void)DownwardTracker.advance(MI, false)
: UpwardTracker.recede(*MI);
@ -707,7 +709,8 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
GCNTrackers = GCNTrackers & !IsLegacyScheduler;
if (IsLegacyScheduler)
GCNTrackersOverride = std::nullopt;
}
GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
@ -1136,9 +1139,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
void GCNScheduleDAGMILive::runSchedStages() {
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
if (!Regions.empty()) {
BBLiveInMap = getRegionLiveInMap();
if (GCNTrackers)
if (S.useGCNTrackers())
RegionLiveOuts.buildLiveRegMap();
}
@ -1150,7 +1154,6 @@ void GCNScheduleDAGMILive::runSchedStages() {
}
#endif
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
auto Stage = createSchedStage(S.getCurrentStage());
if (!Stage->initGCNSchedStage())
@ -1166,7 +1169,7 @@ void GCNScheduleDAGMILive::runSchedStages() {
continue;
}
if (GCNTrackers) {
if (S.useGCNTrackers()) {
GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
GCNRPTracker::LiveRegSet *RegionLiveIns =

View File

@ -74,6 +74,13 @@ protected:
void printCandidateDecision(const SchedCandidate &Current,
const SchedCandidate &Preferred);
void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker,
SUnit *SU, std::vector<unsigned> &Pressure,
std::vector<unsigned> &MaxPressure,
GCNDownwardRPTracker &DownwardTracker,
GCNUpwardRPTracker &UpwardTracker,
ScheduleDAGMI *DAG, const SIRegisterInfo *SRI);
std::vector<unsigned> Pressure;
std::vector<unsigned> MaxPressure;
@ -98,6 +105,10 @@ protected:
// GCN RP Tracker for botttom-up scheduling
mutable GCNUpwardRPTracker UpwardTracker;
bool UseGCNTrackers = false;
std::optional<bool> GCNTrackersOverride;
public:
// schedule() have seen register pressure over the critical limits and had to
// track register pressure for actual scheduling heuristics.
@ -145,6 +156,10 @@ public:
bool hasNextStage() const;
bool useGCNTrackers() const {
return GCNTrackersOverride.value_or(UseGCNTrackers);
}
GCNSchedStageID getNextStage() const;
GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; }

View File

@ -338,6 +338,13 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
// SIRegisterInfo::getRegPressureSetLimit()
Policy.ShouldTrackPressure = true;
const Function &F = Region.RegionBegin->getMF()->getFunction();
if (AMDGPU::getSchedStrategy(F) == "coexec") {
Policy.OnlyTopDown = true;
Policy.OnlyBottomUp = false;
return;
}
// Enabling both top down and bottom up scheduling seems to give us less
// register spills than just using one of these approaches on its own.
Policy.OnlyTopDown = false;

View File

@ -0,0 +1,124 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -verify-misched %s -o - | FileCheck -check-prefix=DEFAULT %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec -verify-misched %s -o - | FileCheck -check-prefix=COEXEC %s
--- |
; Pre-commit test for stall heuristic
define void @test-sched-effective-stall() #0 { ret void }
define void @test-sched-pending-structural-stall() #0 { ret void }
attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
...
name: test-sched-effective-stall
tracksRegLiveness: true
body: |
bb.0:
; DEFAULT-LABEL: name: test-sched-effective-stall
; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF5]], 0, 0, implicit $exec
; DEFAULT-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF10:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF6]], [[DEF7]], 0, [[DEF8]], [[DEF9]], [[DEF10]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
;
; COEXEC-LABEL: name: test-sched-effective-stall
; COEXEC: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec
; COEXEC-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; COEXEC-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
%0:vreg_512_align2 = IMPLICIT_DEF
%1:vreg_512_align2 = IMPLICIT_DEF
%2:vreg_256_align2 = IMPLICIT_DEF
%3:vgpr_32_lo256 = IMPLICIT_DEF
%4:vgpr_32_lo256 = IMPLICIT_DEF
%5:vreg_512_align2 = IMPLICIT_DEF
%6:vreg_512_align2 = IMPLICIT_DEF
%7:vreg_256_align2 = IMPLICIT_DEF
%8:vgpr_32_lo256 = IMPLICIT_DEF
%9:vgpr_32_lo256 = IMPLICIT_DEF
%10:vreg_64_align2 = IMPLICIT_DEF
%11:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %10, 0, 0, implicit $exec
%12:vreg_64_align2 = V_PK_ADD_F32 8, %11, 8, %11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
%13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
%14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5, %6, 0, %7, %8, %9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
S_ENDPGM 0, implicit %12, implicit %13, implicit %14
...
---
name: test-sched-pending-structural-stall
tracksRegLiveness: true
body: |
bb.0:
; DEFAULT-LABEL: name: test-sched-pending-structural-stall
; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; DEFAULT-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; DEFAULT-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; DEFAULT-NEXT: S_NOP 0
; DEFAULT-NEXT: S_NOP 0
; DEFAULT-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; DEFAULT-NEXT: S_ENDPGM 0, implicit %10, implicit %11
;
; COEXEC-LABEL: name: test-sched-pending-structural-stall
; COEXEC: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: S_NOP 0
; COEXEC-NEXT: S_NOP 0
; COEXEC-NEXT: S_ENDPGM 0, implicit %10, implicit %11
%0:vreg_512_align2 = IMPLICIT_DEF
%1:vreg_512_align2 = IMPLICIT_DEF
%2:vreg_256_align2 = IMPLICIT_DEF
%3:vgpr_32_lo256 = IMPLICIT_DEF
%4:vgpr_32_lo256 = IMPLICIT_DEF
%5:vreg_512_align2 = IMPLICIT_DEF
%6:vreg_512_align2 = IMPLICIT_DEF
%7:vreg_256_align2 = IMPLICIT_DEF
%8:vgpr_32_lo256 = IMPLICIT_DEF
%9:vgpr_32_lo256 = IMPLICIT_DEF
%10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
%11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5, %6, 0, %7, %8, %9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
S_NOP 0
S_NOP 0
S_ENDPGM 0, implicit %10, implicit %11
...

View File

@ -0,0 +1,20 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=GFX1100
# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec %s -o /dev/null 2>&1 | FileCheck --allow-empty %s --check-prefix=GFX1250
# GFX1100: warning: {{.*}}'amdgpu-sched-strategy'='coexec' is only supported for gfx1250
# GFX1250-NOT: warning:
--- |
define void @coexec_sched_warning() #0 { ret void }
attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
...
---
name: coexec_sched_warning
tracksRegLiveness: true
body: |
bb.0:
%0:vgpr_32 = IMPLICIT_DEF
S_ENDPGM 0, implicit %0
...