[AMDGPU] Add ML-oriented coexec scheduler selection and queue handling (#169616)
This patch adds the initial coexec scheduler scaffold for machine learning workloads on gfx1250. It introduces function and module-level controls for selecting the AMDGPU preRA and postRA schedulers, including an `amdgpu-workload-type` module flag that maps ML workloads to coexec preRA scheduling and a nop postRA scheduler by default. It also updates the coexec scheduler to use a simplified top-down candidate selection path that considers both available and pending queues through a single flow, setting up follow-on heuristic work.
This commit is contained in:
parent
d69c670934
commit
3e4efe3ed4
283
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
Normal file
283
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
Normal file
@ -0,0 +1,283 @@
|
||||
//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// \file
|
||||
/// Coexecution-focused scheduling strategy for AMDGPU.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPUCoExecSchedStrategy.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "machine-scheduler"
|
||||
|
||||
namespace {
|
||||
|
||||
// Used to disable post-RA scheduling with function level granularity.
|
||||
class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
|
||||
public:
|
||||
explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
|
||||
: ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}
|
||||
|
||||
// Do nothing.
|
||||
void schedule() override {}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
/// Returns the single forced choice of \p Zone, or nullptr when the caller
/// still has to compare candidates.
///
/// SchedBoundary::pickOnlyChoice() is always invoked first because it releases
/// pending instructions and checks for new hazards. Its answer is discarded
/// whenever the pending queue is still populated afterwards, since this
/// strategy also treats pending nodes as candidates.
static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
  SUnit *Sole = Zone.pickOnlyChoice();
  return Zone.Pending.empty() ? Sole : nullptr;
}
|
||||
|
||||
/// Sets up the coexec strategy: GCN pressure trackers are enabled by default
/// and two scheduling stages are registered, in order.
AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
    const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  // The GCN pressure trackers are the more accurate ones, so opt in.
  UseGCNTrackers = true;

  // Stage order: initial ILP schedule first, then pre-RA rematerialization.
  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
}
|
||||
|
||||
/// Runs the base-class policy setup for the region [\p Begin, \p End) and then
/// pins the region policy to top-down only.
///
/// Asserts that the pre-RA direction is either unspecified or top-down.
void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
                                           MachineBasicBlock::iterator End,
                                           unsigned NumRegionInstrs) {
  GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);

  assert((PreRADirection == MISched::TopDown ||
          PreRADirection == MISched::Unspecified) &&
         "coexec scheduler only supports top-down scheduling");

  // Whatever the base class selected, this strategy schedules from the top.
  RegionPolicy.OnlyBottomUp = false;
  RegionPolicy.OnlyTopDown = true;
}
|
||||
|
||||
/// Forces the top-down-only region policy and then defers to
/// GCNSchedStrategy::initialize() for \p DAG.
void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  // The coexec strategy is top-down only so that the new resource balancing
  // heuristics can be supported. The policy is written before the base class
  // initializer runs.
  RegionPolicy.OnlyBottomUp = false;
  RegionPolicy.OnlyTopDown = true;

  GCNSchedStrategy::initialize(DAG);
}
|
||||
|
||||
/// Picks the next node to schedule from the top boundary.
///
/// \param [out] IsTopNode always set to true when a node is returned.
/// \returns the chosen node, or nullptr once the region is exhausted
///          (DAG top and bottom have met).
///
/// A node taken from the pending queue is made issuable before it is returned:
/// the top boundary is advanced to the node's ready cycle, then cycle by cycle
/// until no hazard is reported, and pending nodes are released.
SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
  assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
         "coexec scheduler only supports top-down scheduling");

  // Nothing left in the region: both boundaries must have drained queues.
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }

  SUnit *Picked = nullptr;
  bool FromPendingQ = false;
  for (;;) {
    FromPendingQ = false;
    Picked = pickOnlyChoice(Top);
    if (!Picked) {
      // No forced choice; compare everything in the available and pending
      // queues and take the winner.
      CandPolicy NoPolicy;
      TopCand.reset(NoPolicy);
      pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
                        FromPendingQ, /*IsBottomUp=*/false);
      assert(TopCand.Reason != NoCand && "failed to find a candidate");
      Picked = TopCand.SU;
    }
    IsTopNode = true;

    // Retry if the pick turned out to be a node that is already scheduled.
    if (!Picked->isScheduled)
      break;
  }

  if (FromPendingQ) {
    // Jump straight to the cycle in which the node becomes ready.
    const unsigned ReadyCycle = Picked->TopReadyCycle;
    if (Top.getCurrCycle() < ReadyCycle)
      Top.bumpCycle(ReadyCycle);

    // checkHazard() does not expose the exact cycle where the hazard clears,
    // so step forward one cycle at a time.
    while (Top.checkHazard(Picked))
      Top.bumpCycle(Top.getCurrCycle() + 1);

    Top.releasePending();
  }

  // Drop the node from whichever boundary queues consider it ready.
  if (Picked->isTopReady())
    Top.removeReady(Picked);
  if (Picked->isBottomReady())
    Bot.removeReady(Picked);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << Picked->NodeNum << ") "
                    << *Picked->getInstr());

  assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
  return Picked;
}
|
||||
|
||||
/// Selects the best candidate among the available and pending queues of
/// \p Zone and stores it in \p Cand.
///
/// \param Zone          boundary to pick from; must be the top boundary.
/// \param ZonePolicy    policy used to construct each trial candidate.
/// \param RPTracker     register pressure tracker positioned at the boundary.
/// \param [in,out] Cand current best candidate; replaced whenever a trial
///                      candidate wins the tryCandidate() comparison.
/// \param [out] PickedPending reset to false on entry, then set on every
///                      replacement of \p Cand to whether the new best came
///                      from the pending queue.
/// \param IsBottomUp    must be false; only top-down scheduling is supported.
void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
    SchedBoundary &Zone, const CandPolicy &ZonePolicy,
    const RegPressureTracker &RPTracker, SchedCandidate &Cand,
    bool &PickedPending, bool IsBottomUp) {
  assert(Zone.isTop() && "coexec scheduler only supports top boundary");
  assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");

  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  PickedPending = false;

  // Snapshot the current SGPR/VGPR pressure, from the generic tracker or from
  // the GCN downward tracker depending on which one is in use.
  if (DAG->isTrackingPressure()) {
    if (!useGCNTrackers()) {
      SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
      VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    } else {
      SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
      VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
    }
  }

  // Compares every node of Q against the running best candidate.
  auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
    for (SUnit *SU : Q) {
      SchedCandidate TryCand(ZonePolicy);
      initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                    VGPRPressure, IsBottomUp);
      // Hand the boundary to tryCandidate() only when both candidates belong
      // to the same boundary.
      SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
      tryCandidate(Cand, TryCand, ZoneArg);
      if (TryCand.Reason != NoCand) {
        // Fill in the resource delta if it is still default-constructed.
        if (TryCand.ResDelta == SchedResourceDelta())
          TryCand.initResourceDelta(Zone.DAG, SchedModel);
        LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
        PickedPending = FromPending;
        Cand.setBest(TryCand);
      } else {
        // Trace-only output: guard it like the accepted-candidate trace above
        // so both decision dumps are handled the same way.
        LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
      }
    }
  };

  LLVM_DEBUG(dbgs() << "Available Q:\n");
  EvaluateQueue(Zone.Available, /*FromPending=*/false);

  LLVM_DEBUG(dbgs() << "Pending Q:\n");
  EvaluateQueue(Zone.Pending, /*FromPending=*/true);
}
|
||||
|
||||
/// Decides whether \p TryCand should replace the current best \p Cand.
///
/// Heuristics are applied in a fixed priority order; the first one that
/// separates the two candidates ends the comparison.
///
/// \param Cand    current best candidate (may be invalid, in which case
///                \p TryCand wins with reason FirstValid).
/// \param TryCand candidate under evaluation; its Reason field records the
///                heuristic that made it win.
/// \param Zone    boundary shared by both candidates, or nullptr when they
///                come from different boundaries. The tie-breaking heuristics
///                only run when it is non-null.
/// \returns true if \p TryCand is preferred over \p Cand.
bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
                                             SchedCandidate &TryCand,
                                             SchedBoundary *Zone) const {
  // Result to report once some heuristic has separated the two candidates.
  auto TryCandWon = [&TryCand] { return TryCand.Reason != NoCand; };

  // No incumbent yet: the first candidate seen is accepted as-is.
  if (!Cand.isValid()) {
    TryCand.Reason = FirstValid;
    return true;
  }

  // Bias PhysReg defs towards their uses, and copies towards their defs.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCandWon();

  const bool TrackingPressure = DAG->isTrackingPressure();

  // Avoid exceeding the target's limit.
  if (TrackingPressure &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCandWon();

  // Only a subset of the features is compared when the nodes come from the
  // Top and Bottom boundaries respectively. Some properties are simply
  // incomparable; in many other cases the other boundary should only be
  // overridden when something is a clear good pick on one boundary. The
  // heuristics that are more "tie-breaking" in nature are skipped then.
  const bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // For loops that are acyclic path limited, aggressively schedule for
    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
    // heuristics to take precedence.
    if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
        tryLatency(TryCand, Cand, *Zone))
      return TryCandWon();

    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return TryCandWon();
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  const unsigned IncumbentCluster = Cand.AtTop ? TopClusterID : BotClusterID;
  const unsigned ChallengerCluster =
      TryCand.AtTop ? TopClusterID : BotClusterID;
  const bool IncumbentInCluster =
      isTheSameCluster(IncumbentCluster, Cand.SU->ParentClusterIdx);
  const bool ChallengerInCluster =
      isTheSameCluster(ChallengerCluster, TryCand.SU->ParentClusterIdx);
  if (tryGreater(ChallengerInCluster, IncumbentInCluster, TryCand, Cand,
                 Cluster))
    return TryCandWon();

  // Weak edges are for clustering and other constraints.
  if (SameBoundary && tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                              getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand,
                              Weak))
    return TryCandWon();

  // Avoid increasing the max pressure of the entire region.
  if (TrackingPressure &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCandWon();

  // Everything that follows is a tie-breaker for candidates of one boundary.
  if (!SameBoundary)
    return false;

  // Avoid critical resource consumption and balance the schedule.
  TryCand.initResourceDelta(DAG, SchedModel);
  if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
              TryCand, Cand, ResourceReduce))
    return TryCandWon();
  if (tryGreater(TryCand.ResDelta.DemandedResources,
                 Cand.ResDelta.DemandedResources, TryCand, Cand,
                 ResourceDemand))
    return TryCandWon();

  // Avoid serializing long latency dependence chains.
  // For acyclic path limited loops, latency was already checked above.
  if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
      !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
    return TryCandWon();

  // Fall through to original instruction order: lower node numbers win at the
  // top boundary, higher node numbers at the bottom boundary.
  const bool KeepsSourceOrder = Zone->isTop()
                                    ? TryCand.SU->NodeNum < Cand.SU->NodeNum
                                    : TryCand.SU->NodeNum > Cand.SU->NodeNum;
  if (KeepsSourceOrder) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  return false;
}
|
||||
|
||||
/// Creates the pre-RA scheduler that drives AMDGPUCoExecSchedStrategy for the
/// function described by \p C. The result is returned as a raw pointer.
ScheduleDAGInstrs *
llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
  LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
                    << C->MF->getName() << '\n');

  auto Strategy = std::make_unique<AMDGPUCoExecSchedStrategy>(C);
  return new GCNScheduleDAGMILive(C, std::move(Strategy));
}
|
||||
|
||||
/// Creates the post-RA scheduler whose schedule() step does nothing, for the
/// function described by \p C. The result is returned as a raw pointer.
ScheduleDAGInstrs *
llvm::createGCNNoopPostMachineScheduler(MachineSchedContext *C) {
  LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
                    << C->MF->getName() << '\n');

  ScheduleDAGInstrs *NoopDAG = new GCNNoopPostScheduleDAG(C);
  return NoopDAG;
}
|
||||
46
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
Normal file
46
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
Normal file
@ -0,0 +1,46 @@
|
||||
//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// \file
|
||||
/// Coexecution-focused scheduling strategy for AMDGPU.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
|
||||
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
|
||||
|
||||
#include "GCNSchedStrategy.h"
|
||||
#include "llvm/CodeGen/MachineScheduler.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
|
||||
protected:
|
||||
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
|
||||
SchedBoundary *Zone) const override;
|
||||
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
|
||||
const RegPressureTracker &RPTracker,
|
||||
SchedCandidate &Cand, bool &PickedPending,
|
||||
bool IsBottomUp);
|
||||
|
||||
public:
|
||||
AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);
|
||||
|
||||
void initPolicy(MachineBasicBlock::iterator Begin,
|
||||
MachineBasicBlock::iterator End,
|
||||
unsigned NumRegionInstrs) override;
|
||||
void initialize(ScheduleDAGMI *DAG) override;
|
||||
SUnit *pickNode(bool &IsTopNode) override;
|
||||
};
|
||||
|
||||
ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
|
||||
ScheduleDAGInstrs *createGCNNoopPostMachineScheduler(MachineSchedContext *C);
|
||||
|
||||
} // End namespace llvm
|
||||
|
||||
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
|
||||
@ -18,6 +18,7 @@
|
||||
#include "AMDGPU.h"
|
||||
#include "AMDGPUAliasAnalysis.h"
|
||||
#include "AMDGPUBarrierLatency.h"
|
||||
#include "AMDGPUCoExecSchedStrategy.h"
|
||||
#include "AMDGPUCtorDtorLowering.h"
|
||||
#include "AMDGPUExportClustering.h"
|
||||
#include "AMDGPUExportKernelRuntimeHandles.h"
|
||||
@ -89,6 +90,7 @@
|
||||
#include "llvm/CodeGen/PostRAHazardRecognizer.h"
|
||||
#include "llvm/CodeGen/RegAllocRegistry.h"
|
||||
#include "llvm/CodeGen/TargetPassConfig.h"
|
||||
#include "llvm/IR/DiagnosticInfo.h"
|
||||
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
||||
#include "llvm/IR/PassManager.h"
|
||||
#include "llvm/IR/PatternMatch.h"
|
||||
@ -574,6 +576,38 @@ static cl::opt<std::string>
|
||||
cl::desc("Select custom AMDGPU scheduling strategy."),
|
||||
cl::Hidden, cl::init(""));
|
||||
|
||||
// Scheduler selection is consulted both when creating the scheduler and from
|
||||
// overrideSchedPolicy(), so keep the attribute and global command line handling
|
||||
// in one helper.
|
||||
StringRef llvm::AMDGPU::getSchedStrategy(const Function &F) {
  // A per-function "amdgpu-sched-strategy" attribute takes precedence.
  if (Attribute FnAttr = F.getFnAttribute("amdgpu-sched-strategy");
      FnAttr.isValid())
    return FnAttr.getValueAsString();

  // Otherwise use the global AMDGPUSchedStrategy option; an empty result
  // means that no strategy was requested.
  if (AMDGPUSchedStrategy.empty())
    return "";

  return AMDGPUSchedStrategy;
}
|
||||
|
||||
static void
|
||||
diagnoseUnsupportedCoExecSchedulerSelection(const Function &F,
|
||||
const GCNSubtarget &ST) {
|
||||
if (ST.hasGFX1250Insts())
|
||||
return;
|
||||
|
||||
F.getContext().diagnose(DiagnosticInfoUnsupported(
|
||||
F, "'amdgpu-sched-strategy'='coexec' is only supported for gfx1250",
|
||||
DiagnosticLocation(), DS_Warning));
|
||||
}
|
||||
|
||||
static bool useNoopPostScheduler(const Function &F) {
|
||||
Attribute PostSchedStrategyAttr =
|
||||
F.getFnAttribute("amdgpu-post-sched-strategy");
|
||||
return PostSchedStrategyAttr.isValid() &&
|
||||
PostSchedStrategyAttr.getValueAsString() == "nop";
|
||||
}
|
||||
|
||||
static cl::opt<bool> EnableRewritePartialRegUses(
|
||||
"amdgpu-enable-rewrite-partial-reg-uses",
|
||||
cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
|
||||
@ -1244,11 +1278,7 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
|
||||
if (ST.enableSIScheduler())
|
||||
return createSIMachineScheduler(C);
|
||||
|
||||
Attribute SchedStrategyAttr =
|
||||
C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
|
||||
StringRef SchedStrategy = SchedStrategyAttr.isValid()
|
||||
? SchedStrategyAttr.getValueAsString()
|
||||
: AMDGPUSchedStrategy;
|
||||
StringRef SchedStrategy = AMDGPU::getSchedStrategy(C->MF->getFunction());
|
||||
|
||||
if (SchedStrategy == "max-ilp")
|
||||
return createGCNMaxILPMachineScheduler(C);
|
||||
@ -1265,11 +1295,19 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
|
||||
if (SchedStrategy == "iterative-maxocc")
|
||||
return createIterativeGCNMaxOccupancyMachineScheduler(C);
|
||||
|
||||
if (SchedStrategy == "coexec") {
|
||||
diagnoseUnsupportedCoExecSchedulerSelection(C->MF->getFunction(), ST);
|
||||
return createGCNCoExecMachineScheduler(C);
|
||||
}
|
||||
|
||||
return createGCNMaxOccupancyMachineScheduler(C);
|
||||
}
|
||||
|
||||
ScheduleDAGInstrs *
|
||||
GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
|
||||
if (useNoopPostScheduler(C->MF->getFunction()))
|
||||
return createGCNNoopPostMachineScheduler(C);
|
||||
|
||||
ScheduleDAGMI *DAG =
|
||||
new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(C),
|
||||
/*RemoveKillFlags=*/true);
|
||||
|
||||
@ -27,6 +27,10 @@ namespace llvm {
|
||||
// AMDGPU Target Machine (R600+)
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
namespace AMDGPU {
|
||||
StringRef getSchedStrategy(const Function &F);
|
||||
}
|
||||
|
||||
class AMDGPUTargetMachine : public CodeGenTargetMachineImpl {
|
||||
protected:
|
||||
std::unique_ptr<TargetLoweringObjectFile> TLOF;
|
||||
|
||||
@ -86,6 +86,7 @@ add_llvm_target(AMDGPUCodeGen
|
||||
AMDGPUMacroFusion.cpp
|
||||
AMDGPUMCInstLower.cpp
|
||||
AMDGPUMemoryUtils.cpp
|
||||
AMDGPUCoExecSchedStrategy.cpp
|
||||
AMDGPUIGroupLP.cpp
|
||||
AMDGPULowerVGPREncoding.cpp
|
||||
AMDGPUMCResourceInfo.cpp
|
||||
|
||||
@ -108,6 +108,8 @@ const unsigned ScheduleMetrics::ScaleFactor = 100;
|
||||
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
    : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
      DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
  // Record the GCNTrackers value as an override only when the option occurred
  // at least once; otherwise the override stays unset.
  const bool TrackersOptionSeen = GCNTrackers.getNumOccurrences() > 0;
  if (TrackersOptionSeen)
    GCNTrackersOverride = GCNTrackers;
}
|
||||
|
||||
void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
|
||||
@ -196,7 +198,7 @@ static bool canUsePressureDiffs(const SUnit &SU) {
|
||||
return true;
|
||||
}
|
||||
|
||||
static void getRegisterPressures(
|
||||
void GCNSchedStrategy::getRegisterPressures(
|
||||
bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
|
||||
std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
|
||||
GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
|
||||
@ -204,7 +206,7 @@ static void getRegisterPressures(
|
||||
// getDownwardPressure() and getUpwardPressure() make temporary changes to
|
||||
// the tracker, so we need to pass those function a non-const copy.
|
||||
RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
|
||||
if (!GCNTrackers) {
|
||||
if (!useGCNTrackers()) {
|
||||
AtTop
|
||||
? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
|
||||
: TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
|
||||
@ -256,7 +258,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
|
||||
//
|
||||
// In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
|
||||
// PressureDiffs.
|
||||
if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
|
||||
if (AtTop || !canUsePressureDiffs(*SU) || useGCNTrackers()) {
|
||||
getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
|
||||
DownwardTracker, UpwardTracker, DAG, SRI);
|
||||
} else {
|
||||
@ -400,7 +402,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
|
||||
unsigned VGPRPressure = 0;
|
||||
IsPending = false;
|
||||
if (DAG->isTrackingPressure()) {
|
||||
if (!GCNTrackers) {
|
||||
if (!useGCNTrackers()) {
|
||||
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
|
||||
VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
|
||||
} else {
|
||||
@ -623,7 +625,7 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
|
||||
}
|
||||
|
||||
void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
|
||||
if (GCNTrackers) {
|
||||
if (useGCNTrackers()) {
|
||||
MachineInstr *MI = SU->getInstr();
|
||||
IsTopNode ? (void)DownwardTracker.advance(MI, false)
|
||||
: UpwardTracker.recede(*MI);
|
||||
@ -707,7 +709,8 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
|
||||
SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
|
||||
SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
|
||||
SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
|
||||
GCNTrackers = GCNTrackers & !IsLegacyScheduler;
|
||||
if (IsLegacyScheduler)
|
||||
GCNTrackersOverride = std::nullopt;
|
||||
}
|
||||
|
||||
GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
|
||||
@ -1136,9 +1139,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
|
||||
void GCNScheduleDAGMILive::runSchedStages() {
|
||||
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
|
||||
|
||||
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
|
||||
if (!Regions.empty()) {
|
||||
BBLiveInMap = getRegionLiveInMap();
|
||||
if (GCNTrackers)
|
||||
if (S.useGCNTrackers())
|
||||
RegionLiveOuts.buildLiveRegMap();
|
||||
}
|
||||
|
||||
@ -1150,7 +1154,6 @@ void GCNScheduleDAGMILive::runSchedStages() {
|
||||
}
|
||||
#endif
|
||||
|
||||
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
|
||||
while (S.advanceStage()) {
|
||||
auto Stage = createSchedStage(S.getCurrentStage());
|
||||
if (!Stage->initGCNSchedStage())
|
||||
@ -1166,7 +1169,7 @@ void GCNScheduleDAGMILive::runSchedStages() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (GCNTrackers) {
|
||||
if (S.useGCNTrackers()) {
|
||||
GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
|
||||
GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
|
||||
GCNRPTracker::LiveRegSet *RegionLiveIns =
|
||||
|
||||
@ -74,6 +74,13 @@ protected:
|
||||
void printCandidateDecision(const SchedCandidate &Current,
|
||||
const SchedCandidate &Preferred);
|
||||
|
||||
void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker,
|
||||
SUnit *SU, std::vector<unsigned> &Pressure,
|
||||
std::vector<unsigned> &MaxPressure,
|
||||
GCNDownwardRPTracker &DownwardTracker,
|
||||
GCNUpwardRPTracker &UpwardTracker,
|
||||
ScheduleDAGMI *DAG, const SIRegisterInfo *SRI);
|
||||
|
||||
std::vector<unsigned> Pressure;
|
||||
|
||||
std::vector<unsigned> MaxPressure;
|
||||
@ -98,6 +105,10 @@ protected:
|
||||
// GCN RP Tracker for bottom-up scheduling
|
||||
mutable GCNUpwardRPTracker UpwardTracker;
|
||||
|
||||
bool UseGCNTrackers = false;
|
||||
|
||||
std::optional<bool> GCNTrackersOverride;
|
||||
|
||||
public:
|
||||
// schedule() have seen register pressure over the critical limits and had to
|
||||
// track register pressure for actual scheduling heuristics.
|
||||
@ -145,6 +156,10 @@ public:
|
||||
|
||||
bool hasNextStage() const;
|
||||
|
||||
bool useGCNTrackers() const {
|
||||
return GCNTrackersOverride.value_or(UseGCNTrackers);
|
||||
}
|
||||
|
||||
GCNSchedStageID getNextStage() const;
|
||||
|
||||
GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; }
|
||||
|
||||
@ -338,6 +338,13 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
|
||||
// SIRegisterInfo::getRegPressureSetLimit()
|
||||
Policy.ShouldTrackPressure = true;
|
||||
|
||||
const Function &F = Region.RegionBegin->getMF()->getFunction();
|
||||
if (AMDGPU::getSchedStrategy(F) == "coexec") {
|
||||
Policy.OnlyTopDown = true;
|
||||
Policy.OnlyBottomUp = false;
|
||||
return;
|
||||
}
|
||||
|
||||
// Enabling both top down and bottom up scheduling seems to give us less
|
||||
// register spills than just using one of these approaches on its own.
|
||||
Policy.OnlyTopDown = false;
|
||||
|
||||
124
llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
Normal file
124
llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
Normal file
@ -0,0 +1,124 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -verify-misched %s -o - | FileCheck -check-prefix=DEFAULT %s
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec -verify-misched %s -o - | FileCheck -check-prefix=COEXEC %s
|
||||
|
||||
--- |
|
||||
; Pre-commit test for stall heuristic
|
||||
|
||||
define void @test-sched-effective-stall() #0 { ret void }
|
||||
define void @test-sched-pending-structural-stall() #0 { ret void }
|
||||
|
||||
attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
|
||||
...
|
||||
|
||||
name: test-sched-effective-stall
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
; DEFAULT-LABEL: name: test-sched-effective-stall
|
||||
; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF5]], 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF10:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; DEFAULT-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF6]], [[DEF7]], 0, [[DEF8]], [[DEF9]], [[DEF10]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
|
||||
;
|
||||
; COEXEC-LABEL: name: test-sched-effective-stall
|
||||
; COEXEC: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec
|
||||
; COEXEC-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
; COEXEC-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
; COEXEC-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
|
||||
; COEXEC-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
|
||||
%0:vreg_512_align2 = IMPLICIT_DEF
|
||||
%1:vreg_512_align2 = IMPLICIT_DEF
|
||||
%2:vreg_256_align2 = IMPLICIT_DEF
|
||||
%3:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
%4:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
%5:vreg_512_align2 = IMPLICIT_DEF
|
||||
%6:vreg_512_align2 = IMPLICIT_DEF
|
||||
%7:vreg_256_align2 = IMPLICIT_DEF
|
||||
%8:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
%9:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
%10:vreg_64_align2 = IMPLICIT_DEF
|
||||
%11:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %10, 0, 0, implicit $exec
|
||||
%12:vreg_64_align2 = V_PK_ADD_F32 8, %11, 8, %11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
|
||||
%13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
%14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5, %6, 0, %7, %8, %9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
S_ENDPGM 0, implicit %12, implicit %13, implicit %14
|
||||
...
|
||||
|
||||
---
|
||||
name: test-sched-pending-structural-stall
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
; DEFAULT-LABEL: name: test-sched-pending-structural-stall
|
||||
; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; DEFAULT-NEXT: S_NOP 0
|
||||
; DEFAULT-NEXT: S_NOP 0
|
||||
; DEFAULT-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
; DEFAULT-NEXT: S_ENDPGM 0, implicit %10, implicit %11
|
||||
;
|
||||
; COEXEC-LABEL: name: test-sched-pending-structural-stall
|
||||
; COEXEC: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
; COEXEC-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
; COEXEC-NEXT: S_NOP 0
|
||||
; COEXEC-NEXT: S_NOP 0
|
||||
; COEXEC-NEXT: S_ENDPGM 0, implicit %10, implicit %11
|
||||
%0:vreg_512_align2 = IMPLICIT_DEF
|
||||
%1:vreg_512_align2 = IMPLICIT_DEF
|
||||
%2:vreg_256_align2 = IMPLICIT_DEF
|
||||
%3:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
%4:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
%5:vreg_512_align2 = IMPLICIT_DEF
|
||||
%6:vreg_512_align2 = IMPLICIT_DEF
|
||||
%7:vreg_256_align2 = IMPLICIT_DEF
|
||||
%8:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
%9:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
%10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
%11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5, %6, 0, %7, %8, %9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
S_NOP 0
|
||||
S_NOP 0
|
||||
S_ENDPGM 0, implicit %10, implicit %11
|
||||
...
|
||||
20
llvm/test/CodeGen/AMDGPU/coexec-sched-warning.mir
Normal file
20
llvm/test/CodeGen/AMDGPU/coexec-sched-warning.mir
Normal file
@ -0,0 +1,20 @@
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=GFX1100
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec %s -o /dev/null 2>&1 | FileCheck --allow-empty %s --check-prefix=GFX1250
|
||||
|
||||
# GFX1100: warning: {{.*}}'amdgpu-sched-strategy'='coexec' is only supported for gfx1250
|
||||
# GFX1250-NOT: warning:
|
||||
|
||||
--- |
|
||||
define void @coexec_sched_warning() #0 { ret void }
|
||||
|
||||
attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
|
||||
...
|
||||
|
||||
---
|
||||
name: coexec_sched_warning
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
%0:vgpr_32 = IMPLICIT_DEF
|
||||
S_ENDPGM 0, implicit %0
|
||||
...
|
||||
Loading…
x
Reference in New Issue
Block a user