diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
new file mode 100644
index 000000000000..9d50c4e04794
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -0,0 +1,294 @@
+//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Coexecution-focused scheduling strategy for AMDGPU.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCoExecSchedStrategy.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace {
+
+// Used to disable post-RA scheduling with function level granularity.
+class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
+public:
+  explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
+      : ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}
+
+  // Do nothing.
+  void schedule() override {}
+};
+
+} // namespace
+
+/// Wrapper around SchedBoundary::pickOnlyChoice() that returns nullptr while
+/// \p Zone still has pending nodes, so the caller goes on to compare the
+/// available and pending queues instead of taking the only available node.
+static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
+  // pickOnlyChoice() releases pending instructions and checks for new hazards.
+  SUnit *OnlyChoice = Zone.pickOnlyChoice();
+  if (!Zone.Pending.empty())
+    return nullptr;
+
+  return OnlyChoice;
+}
+
+AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
+    const MachineSchedContext *C)
+    : GCNSchedStrategy(C) {
+  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
+  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
+  // Use more accurate GCN pressure trackers.
+  UseGCNTrackers = true;
+}
+
+void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
+                                           MachineBasicBlock::iterator End,
+                                           unsigned NumRegionInstrs) {
+  GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
+  assert((PreRADirection == MISched::Unspecified ||
+          PreRADirection == MISched::TopDown) &&
+         "coexec scheduler only supports top-down scheduling");
+  RegionPolicy.OnlyTopDown = true;
+  RegionPolicy.OnlyBottomUp = false;
+}
+
+void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
+  // Coexecution scheduling strategy is only done top-down to support new
+  // resource balancing heuristics.
+  RegionPolicy.OnlyTopDown = true;
+  RegionPolicy.OnlyBottomUp = false;
+
+  GCNSchedStrategy::initialize(DAG);
+}
+
+/// Pick the next node to schedule, always from the top boundary. If the pick
+/// came from the pending queue, the current cycle is advanced until the node
+/// is ready and free of hazards before it is returned.
+SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
+  assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
+         "coexec scheduler only supports top-down scheduling");
+
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return nullptr;
+  }
+
+  bool PickedPending = false;
+  SUnit *SU = nullptr;
+  do {
+    PickedPending = false;
+    SU = pickOnlyChoice(Top);
+    if (!SU) {
+      CandPolicy NoPolicy;
+      TopCand.reset(NoPolicy);
+      pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
+                        PickedPending, /*IsBottomUp=*/false);
+      assert(TopCand.Reason != NoCand && "failed to find a candidate");
+      SU = TopCand.SU;
+    }
+    IsTopNode = true;
+  } while (SU->isScheduled);
+
+  if (PickedPending) {
+    unsigned ReadyCycle = SU->TopReadyCycle;
+    unsigned CurrentCycle = Top.getCurrCycle();
+    if (ReadyCycle > CurrentCycle)
+      Top.bumpCycle(ReadyCycle);
+
+    // checkHazard() does not expose the exact cycle where the hazard clears.
+    while (Top.checkHazard(SU))
+      Top.bumpCycle(Top.getCurrCycle() + 1);
+
+    Top.releasePending();
+  }
+
+  if (SU->isTopReady())
+    Top.removeReady(SU);
+  if (SU->isBottomReady())
+    Bot.removeReady(SU);
+
+  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+                    << *SU->getInstr());
+
+  assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
+  return SU;
+}
+
+/// Run tryCandidate() over Zone.Available and then Zone.Pending, keeping the
+/// best node in \p Cand. \p PickedPending is set to whether the most recently
+/// accepted candidate came from the pending queue.
+void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
+    SchedBoundary &Zone, const CandPolicy &ZonePolicy,
+    const RegPressureTracker &RPTracker, SchedCandidate &Cand,
+    bool &PickedPending, bool IsBottomUp) {
+  assert(Zone.isTop() && "coexec scheduler only supports top boundary");
+  assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");
+
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
+  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
+  unsigned SGPRPressure = 0;
+  unsigned VGPRPressure = 0;
+  PickedPending = false;
+  if (DAG->isTrackingPressure()) {
+    if (!useGCNTrackers()) {
+      SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+      VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+    } else {
+      SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
+      VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
+    }
+  }
+
+  auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
+    for (SUnit *SU : Q) {
+      SchedCandidate TryCand(ZonePolicy);
+      initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
+                    VGPRPressure, IsBottomUp);
+      SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+      tryCandidate(Cand, TryCand, ZoneArg);
+      if (TryCand.Reason != NoCand) {
+        if (TryCand.ResDelta == SchedResourceDelta())
+          TryCand.initResourceDelta(Zone.DAG, SchedModel);
+        LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
+        PickedPending = FromPending;
+        Cand.setBest(TryCand);
+      } else {
+        LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
+      }
+    }
+  };
+
+  LLVM_DEBUG(dbgs() << "Available Q:\n");
+  EvaluateQueue(Zone.Available, /*FromPending=*/false);
+
+  LLVM_DEBUG(dbgs() << "Pending Q:\n");
+  EvaluateQueue(Zone.Pending, /*FromPending=*/true);
+}
+
+/// Compare \p TryCand against the current best \p Cand, heuristic by heuristic.
+/// \returns true when TryCand.Reason was set, i.e. \p TryCand is preferred.
+bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
+                                             SchedCandidate &TryCand,
+                                             SchedBoundary *Zone) const {
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = FirstValid;
+    return true;
+  }
+
+  // Bias PhysReg Defs and copies to their uses and defined respectively.
+  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+    return TryCand.Reason != NoCand;
+
+  // Avoid exceeding the target's limit.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+                  RegExcess, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  // We only compare a subset of features when comparing nodes between
+  // Top and Bottom boundary. Some properties are simply incomparable, in many
+  // other instances we should only override the other boundary if something
+  // is a clear good pick on one boundary. Skip heuristics that are more
+  // "tie-breaking" in nature.
+  bool SameBoundary = Zone != nullptr;
+  if (SameBoundary) {
+    // For loops that are acyclic path limited, aggressively schedule for
+    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
+    // heuristics to take precedence.
+    if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
+        tryLatency(TryCand, Cand, *Zone))
+      return TryCand.Reason != NoCand;
+
+    // Prioritize instructions that read unbuffered resources by stall cycles.
+    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+      return TryCand.Reason != NoCand;
+  }
+
+  // Keep clustered nodes together to encourage downstream peephole
+  // optimizations which may reduce resource requirements.
+  //
+  // This is a best effort to set things up for a post-RA pass. Optimizations
+  // like generating loads of multiple registers should ideally be done within
+  // the scheduler pass by combining the loads during DAG postprocessing.
+  unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
+  unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
+  bool CandIsClusterSucc =
+      isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
+  bool TryCandIsClusterSucc =
+      isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
+
+  if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
+                 Cluster))
+    return TryCand.Reason != NoCand;
+
+  if (SameBoundary) {
+    // Weak edges are for clustering and other constraints.
+    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+      return TryCand.Reason != NoCand;
+  }
+
+  // Avoid increasing the max pressure of the entire region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+                  Cand, RegMax, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  if (SameBoundary) {
+    // Avoid critical resource consumption and balance the schedule.
+    TryCand.initResourceDelta(DAG, SchedModel);
+    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+                TryCand, Cand, ResourceReduce))
+      return TryCand.Reason != NoCand;
+    if (tryGreater(TryCand.ResDelta.DemandedResources,
+                   Cand.ResDelta.DemandedResources, TryCand, Cand,
+                   ResourceDemand))
+      return TryCand.Reason != NoCand;
+
+    // Avoid serializing long latency dependence chains.
+    // For acyclic path limited loops, latency was already checked above.
+    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+      return TryCand.Reason != NoCand;
+
+    // Fall through to original instruction order.
+    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
+        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+      TryCand.Reason = NodeOrder;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+ScheduleDAGInstrs *
+llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
+  LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
+                    << C->MF->getName() << '\n');
+  return new GCNScheduleDAGMILive(
+      C, std::make_unique<AMDGPUCoExecSchedStrategy>(C));
+}
+
+ScheduleDAGInstrs *
+llvm::createGCNNoopPostMachineScheduler(MachineSchedContext *C) {
+  LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
+                    << C->MF->getName() << '\n');
+  return new GCNNoopPostScheduleDAG(C);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
new file mode 100644
index 000000000000..2b661f03aa50
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -0,0 +1,46 @@
+//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Coexecution-focused scheduling strategy for AMDGPU.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
+
+#include "GCNSchedStrategy.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
+protected:
+  bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+                    SchedBoundary *Zone) const override;
+  void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
+                         const RegPressureTracker &RPTracker,
+                         SchedCandidate &Cand, bool &PickedPending,
+                         bool IsBottomUp);
+
+public:
+  AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);
+
+  void initPolicy(MachineBasicBlock::iterator Begin,
+                  MachineBasicBlock::iterator End,
+                  unsigned NumRegionInstrs) override;
+  void initialize(ScheduleDAGMI *DAG) override;
+  SUnit *pickNode(bool &IsTopNode) override;
+};
+
+ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
+ScheduleDAGInstrs *createGCNNoopPostMachineScheduler(MachineSchedContext *C);
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 3d737d432af6..daa9f933fce5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUAliasAnalysis.h"
 #include "AMDGPUBarrierLatency.h"
+#include "AMDGPUCoExecSchedStrategy.h"
 #include "AMDGPUCtorDtorLowering.h"
 #include "AMDGPUExportClustering.h"
 #include "AMDGPUExportKernelRuntimeHandles.h"
@@ -89,6 +90,7 @@
 #include "llvm/CodeGen/PostRAHazardRecognizer.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/PatternMatch.h"
@@ -574,6 +576,38 @@ static cl::opt<std::string>
                         cl::desc("Select custom AMDGPU scheduling strategy."),
                         cl::Hidden, cl::init(""));
 
+// Scheduler selection is consulted both when creating the scheduler and from
+// overrideSchedPolicy(), so keep the attribute and global command line handling
+// in one helper.
+StringRef llvm::AMDGPU::getSchedStrategy(const Function &F) {
+  Attribute SchedStrategyAttr = F.getFnAttribute("amdgpu-sched-strategy");
+  if (SchedStrategyAttr.isValid())
+    return SchedStrategyAttr.getValueAsString();
+
+  if (!AMDGPUSchedStrategy.empty())
+    return AMDGPUSchedStrategy;
+
+  return "";
+}
+
+static void
+diagnoseUnsupportedCoExecSchedulerSelection(const Function &F,
+                                            const GCNSubtarget &ST) {
+  if (ST.hasGFX1250Insts())
+    return;
+
+  F.getContext().diagnose(DiagnosticInfoUnsupported(
+      F, "'amdgpu-sched-strategy'='coexec' is only supported for gfx1250",
+      DiagnosticLocation(), DS_Warning));
+}
+
+static bool useNoopPostScheduler(const Function &F) {
+  Attribute PostSchedStrategyAttr =
+      F.getFnAttribute("amdgpu-post-sched-strategy");
+  return PostSchedStrategyAttr.isValid() &&
+         PostSchedStrategyAttr.getValueAsString() == "nop";
+}
+
 static cl::opt<bool> EnableRewritePartialRegUses(
     "amdgpu-enable-rewrite-partial-reg-uses",
     cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
@@ -1244,11 +1278,7 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
   if (ST.enableSIScheduler())
     return createSIMachineScheduler(C);
 
-  Attribute SchedStrategyAttr =
-      C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
-  StringRef SchedStrategy = SchedStrategyAttr.isValid()
-                                ? SchedStrategyAttr.getValueAsString()
-                                : AMDGPUSchedStrategy;
+  StringRef SchedStrategy = AMDGPU::getSchedStrategy(C->MF->getFunction());
 
   if (SchedStrategy == "max-ilp")
     return createGCNMaxILPMachineScheduler(C);
@@ -1265,11 +1295,19 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
   if (SchedStrategy == "iterative-maxocc")
     return createIterativeGCNMaxOccupancyMachineScheduler(C);
 
+  if (SchedStrategy == "coexec") {
+    diagnoseUnsupportedCoExecSchedulerSelection(C->MF->getFunction(), ST);
+    return createGCNCoExecMachineScheduler(C);
+  }
+
   return createGCNMaxOccupancyMachineScheduler(C);
 }
 
 ScheduleDAGInstrs *
 GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
+  if (useNoopPostScheduler(C->MF->getFunction()))
+    return createGCNNoopPostMachineScheduler(C);
+
   ScheduleDAGMI *DAG =
       new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(C),
                                    /*RemoveKillFlags=*/true);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index e5cb867b95c7..a9e24acec045 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -27,6 +27,10 @@ namespace llvm {
 // AMDGPU Target Machine (R600+)
 //===----------------------------------------------------------------------===//
 
+namespace AMDGPU {
+StringRef getSchedStrategy(const Function &F);
+} // namespace AMDGPU
+
 class AMDGPUTargetMachine : public CodeGenTargetMachineImpl {
 protected:
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 65603ea3d7cc..cda288e01292 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -86,6 +86,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMemoryUtils.cpp
+  AMDGPUCoExecSchedStrategy.cpp
   AMDGPUIGroupLP.cpp
   AMDGPULowerVGPREncoding.cpp
   AMDGPUMCResourceInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 932b9e8f52b2..7f1479586d7c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -108,6 +108,8 @@ const unsigned ScheduleMetrics::ScaleFactor = 100;
 GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
       DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
+  if (GCNTrackers.getNumOccurrences() > 0)
+    GCNTrackersOverride = GCNTrackers;
 }
 
 void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
@@ -196,7 +198,7 @@ static bool canUsePressureDiffs(const SUnit &SU) {
   return true;
 }
 
-static void getRegisterPressures(
+void GCNSchedStrategy::getRegisterPressures(
     bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
     std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
     GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
@@ -204,7 +206,7 @@ static void getRegisterPressures(
   // getDownwardPressure() and getUpwardPressure() make temporary changes to
   // the tracker, so we need to pass those function a non-const copy.
   RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
-  if (!GCNTrackers) {
+  if (!useGCNTrackers()) {
     AtTop
         ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
         : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
@@ -256,7 +258,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
   //
   // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
   // PressureDiffs.
-  if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
+  if (AtTop || !canUsePressureDiffs(*SU) || useGCNTrackers()) {
     getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
                          DownwardTracker, UpwardTracker, DAG, SRI);
   } else {
@@ -400,7 +402,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
   unsigned VGPRPressure = 0;
   IsPending = false;
   if (DAG->isTrackingPressure()) {
-    if (!GCNTrackers) {
+    if (!useGCNTrackers()) {
       SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
       VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
     } else {
@@ -623,7 +625,7 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
 }
 
 void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
-  if (GCNTrackers) {
+  if (useGCNTrackers()) {
     MachineInstr *MI = SU->getInstr();
     IsTopNode ? (void)DownwardTracker.advance(MI, false)
               : UpwardTracker.recede(*MI);
@@ -707,7 +709,8 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
-  GCNTrackers = GCNTrackers & !IsLegacyScheduler;
+  if (IsLegacyScheduler)
+    GCNTrackersOverride = std::nullopt;
 }
 
 GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
@@ -1136,9 +1139,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
 void GCNScheduleDAGMILive::runSchedStages() {
   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
 
+  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
   if (!Regions.empty()) {
     BBLiveInMap = getRegionLiveInMap();
-    if (GCNTrackers)
+    if (S.useGCNTrackers())
       RegionLiveOuts.buildLiveRegMap();
   }
 
@@ -1150,7 +1154,6 @@ void GCNScheduleDAGMILive::runSchedStages() {
   }
 #endif
 
-  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
   while (S.advanceStage()) {
     auto Stage = createSchedStage(S.getCurrentStage());
     if (!Stage->initGCNSchedStage())
@@ -1166,7 +1169,7 @@ void GCNScheduleDAGMILive::runSchedStages() {
         continue;
       }
 
-      if (GCNTrackers) {
+      if (S.useGCNTrackers()) {
         GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
         GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
         GCNRPTracker::LiveRegSet *RegionLiveIns =
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 99fd55db3328..4430503d441e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -74,6 +74,13 @@ protected:
   void printCandidateDecision(const SchedCandidate &Current,
                               const SchedCandidate &Preferred);
 
+  void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker,
+                            SUnit *SU, std::vector<unsigned> &Pressure,
+                            std::vector<unsigned> &MaxPressure,
+                            GCNDownwardRPTracker &DownwardTracker,
+                            GCNUpwardRPTracker &UpwardTracker,
+                            ScheduleDAGMI *DAG, const SIRegisterInfo *SRI);
+
   std::vector<unsigned> Pressure;
 
   std::vector<unsigned> MaxPressure;
@@ -98,6 +105,10 @@ protected:
   // GCN RP Tracker for botttom-up scheduling
   mutable GCNUpwardRPTracker UpwardTracker;
 
+  bool UseGCNTrackers = false;
+
+  std::optional<bool> GCNTrackersOverride;
+
 public:
   // schedule() have seen register pressure over the critical limits and had to
   // track register pressure for actual scheduling heuristics.
@@ -145,6 +156,10 @@ public:
 
   bool hasNextStage() const;
 
+  bool useGCNTrackers() const {
+    return GCNTrackersOverride.value_or(UseGCNTrackers);
+  }
+
   GCNSchedStageID getNextStage() const;
 
   GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 77bb36e9e60a..c56a746d6300 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -338,6 +338,13 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
   // SIRegisterInfo::getRegPressureSetLimit()
   Policy.ShouldTrackPressure = true;
 
+  const Function &F = Region.RegionBegin->getMF()->getFunction();
+  if (AMDGPU::getSchedStrategy(F) == "coexec") {
+    Policy.OnlyTopDown = true;
+    Policy.OnlyBottomUp = false;
+    return;
+  }
+
   // Enabling both top down and bottom up scheduling seems to give us less
   // register spills than just using one of these approaches on its own.
   Policy.OnlyTopDown = false;
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
new file mode 100644
index 000000000000..bac94bdffd37
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -0,0 +1,124 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -verify-misched %s -o - | FileCheck -check-prefix=DEFAULT %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec -verify-misched %s -o - | FileCheck -check-prefix=COEXEC %s
+
+--- |
+  ; Pre-commit test for stall heuristic
+
+  define void @test-sched-effective-stall() #0 { ret void }
+  define void @test-sched-pending-structural-stall() #0 { ret void }
+
+  attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
+...
+
+name: test-sched-effective-stall
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; DEFAULT-LABEL: name: test-sched-effective-stall
+    ; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF5]], 0, 0, implicit $exec
+    ; DEFAULT-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF10:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; DEFAULT-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF6]], [[DEF7]], 0, [[DEF8]], [[DEF9]], [[DEF10]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
+    ;
+    ; COEXEC-LABEL: name: test-sched-effective-stall
+    ; COEXEC: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec
+    ; COEXEC-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; COEXEC-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
+    %0:vreg_512_align2 = IMPLICIT_DEF
+    %1:vreg_512_align2 = IMPLICIT_DEF
+    %2:vreg_256_align2 = IMPLICIT_DEF
+    %3:vgpr_32_lo256 = IMPLICIT_DEF
+    %4:vgpr_32_lo256 = IMPLICIT_DEF
+    %5:vreg_512_align2 = IMPLICIT_DEF
+    %6:vreg_512_align2 = IMPLICIT_DEF
+    %7:vreg_256_align2 = IMPLICIT_DEF
+    %8:vgpr_32_lo256 = IMPLICIT_DEF
+    %9:vgpr_32_lo256 = IMPLICIT_DEF
+    %10:vreg_64_align2 = IMPLICIT_DEF
+    %11:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %10, 0, 0, implicit $exec
+    %12:vreg_64_align2 = V_PK_ADD_F32 8, %11, 8, %11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5, %6, 0, %7, %8, %9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0, implicit %12, implicit %13, implicit %14
+...
+
+---
+name: test-sched-pending-structural-stall
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; DEFAULT-LABEL: name: test-sched-pending-structural-stall
+    ; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; DEFAULT-NEXT: S_NOP 0
+    ; DEFAULT-NEXT: S_NOP 0
+    ; DEFAULT-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; DEFAULT-NEXT: S_ENDPGM 0, implicit %10, implicit %11
+    ;
+    ; COEXEC-LABEL: name: test-sched-pending-structural-stall
+    ; COEXEC: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+    ; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; COEXEC-NEXT: S_NOP 0
+    ; COEXEC-NEXT: S_NOP 0
+    ; COEXEC-NEXT: S_ENDPGM 0, implicit %10, implicit %11
+    %0:vreg_512_align2 = IMPLICIT_DEF
+    %1:vreg_512_align2 = IMPLICIT_DEF
+    %2:vreg_256_align2 = IMPLICIT_DEF
+    %3:vgpr_32_lo256 = IMPLICIT_DEF
+    %4:vgpr_32_lo256 = IMPLICIT_DEF
+    %5:vreg_512_align2 = IMPLICIT_DEF
+    %6:vreg_512_align2 = IMPLICIT_DEF
+    %7:vreg_256_align2 = IMPLICIT_DEF
+    %8:vgpr_32_lo256 = IMPLICIT_DEF
+    %9:vgpr_32_lo256 = IMPLICIT_DEF
+    %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5, %6, 0, %7, %8, %9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    S_NOP 0
+    S_NOP 0
+    S_ENDPGM 0, implicit %10, implicit %11
+...
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-warning.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-warning.mir
new file mode 100644
index 000000000000..db615afae9a2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-warning.mir
@@ -0,0 +1,20 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=GFX1100
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec %s -o /dev/null 2>&1 | FileCheck --allow-empty %s --check-prefix=GFX1250
+
+# GFX1100: warning: {{.*}}'amdgpu-sched-strategy'='coexec' is only supported for gfx1250
+# GFX1250-NOT: warning:
+
+--- |
+  define void @coexec_sched_warning() #0 { ret void }
+
+  attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
+...
+
+---
+name: coexec_sched_warning
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    S_ENDPGM 0, implicit %0
+...