llvm-project/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
Jonas Paulsson 56a4315ee0
[SystemZ] Add a SystemZ specific pre-RA scheduling strategy. (#135076)
This is a relatively simple strategy as it is omitting any heuristics for
liveness and register pressure reduction. This works well as the SystemZ ISel
scheduler is using Sched::RegPressure which gives a good input order to begin
with.

It is trying harder with biasing phys regs than GenericScheduler as it also
considers other instructions such as immediate loads directly into phys-regs
produced by the register coalescer. This can hopefully be refactored into 
MachineScheduler.cpp.

It has a latency heuristic that is slightly different from the one in
GenericScheduler: It is activated for a specific type of region that has
many "data sequences" consisting of SUs connected only with a single
data-edge that are next to each other in the input order. This is only 3% of
all the scheduling regions, but when activated it is applied on all the
candidates (not just once per cycle). At the same time it is a bit more
careful by checking not only the SU Height against the scheduled latency but
also its Depth against the remaining latency.

It reuses the GenericScheduler handling of weak edges to help copy
coalescing.

It also helps with compare zero elimination as it tries to put a CC-defining
instruction that produces the compare source value above the compare before
any other instruction clobbering CC or the value.

This work was started after observing heavy spilling in Cactus, which was
actually *caused* by GenericScheduler - disabling it (no pre-RA scheduling)
remedied it and gave a 7% improvement in performance on that benchmark. Many
different versions have been tried, which have evolved into this initial
simplistic MachineSchedStrategy that does relatively little and yet achieves
double-digit improvements on Cactus and Imagick compared to GenericSched
(which is OTOH 3% better on Blender). There will hopefully be more
improvements added later on as there seems to be potential for it.

It would be very interesting to have other OOO targets try this as well and
perhaps make this available in MachineScheduler.cpp.

(A first attempt with improving the pre-RA scheduling was made with #90181,
which however did not materialize in anything actually useful.)
2026-03-10 15:38:05 +01:00

437 lines
15 KiB
C++

//-- SystemZMachineScheduler.cpp - SystemZ Scheduler Interface -*- C++ -*---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "SystemZMachineScheduler.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
using namespace llvm;
#define DEBUG_TYPE "machine-scheduler"
/// Pre-RA scheduling ///
// Returns true if MO is a register operand that is also a definition.
static bool isRegDef(const MachineOperand &MO) {
  if (!MO.isReg())
    return false;
  return MO.isDef();
}
// Returns true if MO is a register definition whose register is physical.
static bool isPhysRegDef(const MachineOperand &MO) {
  if (!isRegDef(MO))
    return false;
  return MO.getReg().isPhysical();
}
void SystemZPreRASchedStrategy::initializeLatencyReduction() {
// Enable latency reduction for a region that has a considerable amount of
// data sequences that should be interlaved. These are SUs that only have
// one data predecessor / successor edge(s) to their adjacent instruction(s)
// in the input order. Disable if region has many SUs relative to the
// overall height.
unsigned DAGHeight = 0;
for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx)
DAGHeight = std::max(DAGHeight, DAG->SUnits[Idx].getHeight());
RegionPolicy.DisableLatencyHeuristic =
DAG->SUnits.size() >= 3 * std::max(DAGHeight, 1u);
if ((HasDataSequences = !RegionPolicy.DisableLatencyHeuristic)) {
unsigned CurrSequence = 0, NumSeqNodes = 0;
auto countSequence = [&CurrSequence, &NumSeqNodes]() {
if (CurrSequence >= 2)
NumSeqNodes += CurrSequence;
CurrSequence = 0;
};
for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
const SUnit *SU = &DAG->SUnits[Idx];
bool InDataSequence = true;
// One Data pred to MI just above, or no preds.
unsigned NumPreds = 0;
for (const SDep &Pred : SU->Preds)
if (++NumPreds != 1 || Pred.getKind() != SDep::Data ||
Pred.getSUnit()->NodeNum != Idx - 1)
InDataSequence = false;
// One Data succ or no succs (ignoring ExitSU).
unsigned NumSuccs = 0;
for (const SDep &Succ : SU->Succs)
if (Succ.getSUnit() != &DAG->ExitSU &&
(++NumSuccs != 1 || Succ.getKind() != SDep::Data))
InDataSequence = false;
// Another type of node or one that does not have a single data pred
// ends any previous sequence.
if (!InDataSequence || !NumPreds)
countSequence();
if (InDataSequence)
CurrSequence++;
}
countSequence();
if (NumSeqNodes >= std::max(size_t(4), DAG->SUnits.size() / 4)) {
LLVM_DEBUG(dbgs() << "Number of nodes in def-use sequences: "
<< NumSeqNodes << ". ";);
} else
HasDataSequences = false;
}
}
bool SystemZPreRASchedStrategy::definesCmp0Src(const MachineInstr *MI,
bool CCDef) const {
if (Cmp0SrcReg != SystemZ::NoRegister && MI->getNumOperands() &&
(MI->getDesc().hasImplicitDefOfPhysReg(SystemZ::CC) || !CCDef)) {
const MachineOperand &MO0 = MI->getOperand(0);
assert(!isPhysRegDef(MO0) && "Did not expect physreg def!");
if (isRegDef(MO0) && MO0.getReg() == Cmp0SrcReg)
return true;
}
return false;
}
// Physreg bias for SU (bottom-up). The result of biasPhysReg() is used when it
// is non-zero. Otherwise any non-copy instruction whose first operand defines
// a physical register gives a bias of 1, and everything else 0.
static int biasPhysRegExtra(const SUnit *SU) {
  const int GenericBias = biasPhysReg(SU, /*isTop=*/false);
  if (GenericBias != 0)
    return GenericBias;

  // Also recognize Load Address. Most of these are with an FI operand.
  const MachineInstr *MI = SU->getInstr();
  if (!MI->getNumOperands() || MI->isCopy())
    return 0;
  return isPhysRegDef(MI->getOperand(0)) ? 1 : 0;
}
// Compare TryCand against the current best candidate Cand (bottom-up only).
// The heuristics are applied in order: physreg bias, latency reduction, weak
// edges, compare-with-zero elimination and finally the original instruction
// order. Once a heuristic has decided, the return value tells whether
// TryCand.Reason ended up as something other than NoCand.
bool SystemZPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
                                             SchedCandidate &TryCand,
                                             SchedBoundary *Zone) const {
  assert(Zone && !Zone->isTop() && "Bottom-Up scheduling only.");

  // Without a valid current candidate, TryCand is accepted as it is.
  if (!Cand.isValid()) {
    TryCand.Reason = FirstValid;
    return true;
  }

  // Physreg defs and copies are biased towards their uses and definitions
  // respectively.
  const int TryBias = biasPhysRegExtra(TryCand.SU);
  const int CurBias = biasPhysRegExtra(Cand.SU);
  if (tryGreater(TryBias, CurBias, TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;
  if (TryBias && CurBias) {
    // Equal, non-zero bias: decide on the node order.
    tryGreater(TryCand.SU->NodeNum, Cand.SU->NodeNum, TryCand, Cand, NodeOrder);
    return TryCand.Reason != NoCand;
  }

  // Avoid extending the scheduled latency in regions with many nodes in data
  // sequences, or in (single block loop) regions that are acyclically (within
  // a single loop iteration) latency limited. IsAcyclicLatencyLimited is only
  // set after initialization, in registerRoots(), so it has to be tested here
  // rather than earlier.
  if (!RegionPolicy.DisableLatencyHeuristic &&
      (HasDataSequences || Rem.IsAcyclicLatencyLimited)) {
    const unsigned TryHeight = TryCand.SU->getHeight();
    const unsigned CurHeight = Cand.SU->getHeight();
    if (TryHeight != CurHeight) {
      const SUnit *Taller = TryHeight > CurHeight ? TryCand.SU : Cand.SU;
      if (Taller->getHeight() > Zone->getScheduledLatency() &&
          Taller->getDepth() < computeRemLatency(*Zone)) {
        // One or both SUs increase the scheduled latency.
        tryLess(TryHeight, CurHeight, TryCand, Cand,
                GenericSchedulerBase::BotHeightReduce);
        return TryCand.Reason != NoCand;
      }
    }
  }

  // Weak edges help copy coalescing.
  if (tryLess(TryCand.SU->WeakSuccsLeft, Cand.SU->WeakSuccsLeft, TryCand, Cand,
              Weak))
    return TryCand.Reason != NoCand;

  // Help compare with zero elimination.
  const bool TryDefsCmp0Src = definesCmp0Src(TryCand.SU->getInstr());
  const bool CurDefsCmp0Src = definesCmp0Src(Cand.SU->getInstr());
  if (tryGreater(TryDefsCmp0Src, CurDefsCmp0Src, TryCand, Cand, Weak))
    return TryCand.Reason != NoCand;

  // Nothing else decided: fall back to the original instruction order.
  const bool TryIsLater = TryCand.SU->NodeNum > Cand.SU->NodeNum;
  if (TryIsLater)
    TryCand.Reason = NodeOrder;
  return TryIsLater;
}
// Set up the region policy for the region starting at Begin and record the
// number of instructions in it.
void SystemZPreRASchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
                                           MachineBasicBlock::iterator End,
                                           unsigned NumRegionInstrs) {
  // To save compile time, the register pressure tracker is not set up for
  // small regions. It is currently only used for computeCyclicCriticalPath(),
  // which is used for single block loops.
  MachineBasicBlock *Parent = Begin->getParent();
  const bool IsSingleBlockLoop = Parent->isSuccessor(Parent);
  RegionPolicy.ShouldTrackPressure = IsSingleBlockLoop && NumRegionInstrs >= 8;

  // These heuristics have so far seemed to work better without adding a
  // top-down boundary.
  RegionPolicy.OnlyBottomUp = true;

  this->NumRegionInstrs = NumRegionInstrs;
  BotIdx = NumRegionInstrs - 1;
}
// Per-region initialization: run the GenericScheduler initialization, forget
// any tracked compare-with-zero source register, and decide whether latency
// reduction applies to this region.
void SystemZPreRASchedStrategy::initialize(ScheduleDAGMI *dag) {
  GenericScheduler::initialize(dag);

  // No compare with zero has been scheduled yet in this region.
  Cmp0SrcReg = SystemZ::NoRegister;

  initializeLatencyReduction();
  LLVM_DEBUG({
    dbgs() << "Latency scheduling ";
    if (!HasDataSequences)
      dbgs() << "not ";
    dbgs() << "enabled for data sequences.\n";
  });
}
// Called when SU has been scheduled. Besides the GenericScheduler bookkeeping
// this keeps Cmp0SrcReg up to date: a compare with zero starts tracking its
// source register, while an instruction that implicitly defines CC, or that
// defines the tracked register, ends the tracking.
void SystemZPreRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  GenericScheduler::schedNode(SU, IsTopNode);

  const auto *ZII = static_cast<const SystemZInstrInfo *>(DAG->TII);
  MachineInstr *MI = SU->getInstr();

  if (ZII->isCompareZero(*MI)) {
    Cmp0SrcReg = ZII->getCompareSourceReg(*MI);
    return;
  }

  const bool ClobbersCC = MI->getDesc().hasImplicitDefOfPhysReg(SystemZ::CC);
  if (ClobbersCC || definesCmp0Src(MI, /*CCDef=*/false))
    Cmp0SrcReg = SystemZ::NoRegister;
}
/// Post-RA scheduling ///
#ifndef NDEBUG
// Debug helper: print all SUs of the set to dbgs() as a comma-separated list
// enclosed in braces, followed by a newline.
void SystemZPostRASchedStrategy::SUSet::dump(
    SystemZHazardRecognizer &HazardRec) const {
  auto &OS = dbgs();
  OS << "{";
  // The separator is emitted in front of every element except the first.
  const char *Separator = "";
  for (SUnit *SU : *this) {
    OS << Separator;
    HazardRec.dumpSU(SU, OS);
    Separator = ", ";
  }
  OS << "}\n";
}
#endif
// Look for a single predecessor of MBB that is interesting for the scheduler
// in the top-most region of MBB. Returns nullptr if there is none.
static MachineBasicBlock *getSingleSchedPred(MachineBasicBlock *MBB,
                                             const MachineLoop *Loop) {
  MachineBasicBlock *Result = nullptr;
  const unsigned NumPreds = MBB->pred_size();

  if (NumPreds == 1) {
    Result = *MBB->pred_begin();
  } else if (NumPreds == 2 && Loop && Loop->getHeader() == MBB) {
    // A loop header with two predecessors: use the one inside the loop (the
    // latch), unless that is MBB itself (single block loop).
    for (MachineBasicBlock *Pred : MBB->predecessors()) {
      if (!Loop->contains(Pred))
        continue;
      Result = (Pred == MBB) ? nullptr : Pred;
    }
  }

  assert((Result == nullptr || !Loop || Loop->contains(Result)) &&
         "Loop MBB should not consider predecessor outside of loop.");
  return Result;
}
// Feed the hazard recognizer with the instructions of MBB up to (but not
// including) NextBegin. Emission resumes right after the last instruction
// emitted so far if that one belongs to MBB, and otherwise starts at the top
// of MBB. Position and debug instructions are skipped.
void SystemZPostRASchedStrategy::advanceTo(
    MachineBasicBlock::iterator NextBegin) {
  MachineBasicBlock::iterator Cursor = MBB->begin();
  MachineBasicBlock::iterator Last = HazardRec->getLastEmittedMI();
  if (Last != nullptr && Last->getParent() == MBB)
    Cursor = std::next(Last);

  for (; Cursor != NextBegin; ++Cursor) {
    MachineInstr &MI = *Cursor;
    if (!MI.isPosition() && !MI.isDebugInstr())
      HazardRec->emitInstruction(&MI);
  }
}
// Per-region initialization: start out with an empty Available set and dump
// the hazard recognizer state in debug builds.
void SystemZPostRASchedStrategy::initialize(ScheduleDAGMI *dag) {
  // Related to -misched-cutoff (presumably SUs can be left behind in the set
  // when scheduling of the previous region was cut short).
  Available.clear();

  LLVM_DEBUG(HazardRec->dumpState());
}
// Begin scheduling of NextMBB. A new hazard recognizer is created for the
// block and registered in SchedStates. If the block has a single interesting
// predecessor that has already been scheduled, the state of that block is
// taken over and its terminators are emitted.
void SystemZPostRASchedStrategy::enterMBB(MachineBasicBlock *NextMBB) {
  assert((SchedStates.find(NextMBB) == SchedStates.end()) &&
         "Entering MBB twice?");
  LLVM_DEBUG(dbgs() << "** Entering " << printMBBReference(*NextMBB));

  MBB = NextMBB;

  // The recognizer is owned through SchedStates (see the destructor), with
  // HazardRec pointing to the one for the current block.
  HazardRec = new SystemZHazardRecognizer(TII, &SchedModel);
  SchedStates[MBB] = HazardRec;

  LLVM_DEBUG({
    const MachineLoop *Loop = MLI->getLoopFor(MBB);
    if (Loop && Loop->getHeader() == MBB)
      dbgs() << " (Loop header)";
    dbgs() << ":\n";
  });

  // Try to continue from the state of a single predecessor, given that it
  // has been scheduled. Otherwise there is nothing more to do.
  MachineBasicBlock *SinglePredMBB =
      getSingleSchedPred(MBB, MLI->getLoopFor(MBB));
  if (!SinglePredMBB)
    return;
  auto PredState = SchedStates.find(SinglePredMBB);
  if (PredState == SchedStates.end())
    return;

  LLVM_DEBUG(dbgs() << "** Continued scheduling from "
                    << printMBBReference(*SinglePredMBB) << "\n";);

  HazardRec->copyState(PredState->second);
  LLVM_DEBUG(HazardRec->dumpState(););

  // Emit incoming terminator(s). Be optimistic and assume that branch
  // prediction will generally do "the right thing".
  for (MachineInstr &Term : SinglePredMBB->terminators()) {
    LLVM_DEBUG(dbgs() << "** Emitting incoming branch: "; Term.dump(););

    // A branch counts as taken if it is indirect or if it targets MBB.
    bool Taken = false;
    if (Term.isBranch()) {
      auto Info = TII->getBranchInfo(Term);
      Taken = Info.isIndirect() || Info.getMBBTarget() == MBB;
    }

    HazardRec->emitInstruction(&Term, Taken);
    if (Taken)
      break;
  }
}
void SystemZPostRASchedStrategy::leaveMBB() {
LLVM_DEBUG(dbgs() << "** Leaving " << printMBBReference(*MBB) << "\n";);
// Advance to first terminator. The successor block will handle terminators
// dependent on CFG layout (T/NT branch etc).
advanceTo(MBB->getFirstTerminator());
}
// Construct the strategy from the scheduling context C: take the loop info
// and the SystemZ instruction info from it, and initialize the scheduling
// model from the subtarget. No MBB or hazard recognizer is current yet.
SystemZPostRASchedStrategy::SystemZPostRASchedStrategy(
    const MachineSchedContext *C)
    : MLI(C->MLI),
      TII(static_cast<const SystemZInstrInfo *>(
          C->MF->getSubtarget().getInstrInfo())),
      MBB(nullptr), HazardRec(nullptr) {
  SchedModel.init(&C->MF->getSubtarget());
}
// Free the hazard recognizers that were kept around, one for each MBB.
SystemZPostRASchedStrategy::~SystemZPostRASchedStrategy() {
  for (const auto &Entry : SchedStates)
    delete Entry.second;
}
// Called before a region is scheduled: emit the instructions that precede
// the region into the hazard recognizer. Nothing is emitted if the region
// begins with a terminator.
void SystemZPostRASchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
                                            MachineBasicBlock::iterator End,
                                            unsigned NumRegionInstrs) {
  // Terminators are not emitted here.
  if (!Begin->isTerminator())
    advanceTo(Begin);
}
// Pick the next node to schedule among the SUs in the Available set.
// IsTopNode is always set to true (top-down scheduling only). Returns nullptr
// if no SU is available.
SUnit *SystemZPostRASchedStrategy::pickNode(bool &IsTopNode) {
  IsTopNode = true;

  if (Available.empty())
    return nullptr;

  // A single available SU leaves no choice.
  if (Available.size() == 1) {
    SUnit *Only = *Available.begin();
    LLVM_DEBUG(dbgs() << "** Only one: "; HazardRec->dumpSU(Only, dbgs());
               dbgs() << "\n";);
    return Only;
  }

  // Available holds all the SUs that are currently possible to schedule.
  LLVM_DEBUG(dbgs() << "** Available: "; Available.dump(*HazardRec););

  Candidate Best;
  for (SUnit *SU : Available) {
    // Evaluate SU and keep it if it beats the best candidate found so far.
    Candidate Current(SU, *HazardRec);
    const bool IsNewBest = Best.SU == nullptr || Current < Best;
    if (IsNewBest)
      Best = Current;

    LLVM_DEBUG(dbgs() << (IsNewBest ? "** Best so far: " : "** Tried : ");
               HazardRec->dumpSU(Current.SU, dbgs()); Current.dumpCosts();
               dbgs() << " Height:" << Current.SU->getHeight();
               dbgs() << "\n";);

    // Once all SUs that affect grouping or use unbuffered resources have
    // been seen, the search can stop if Best looks good.
    if (!SU->isScheduleHigh && Best.noCost())
      break;
  }

  assert(Best.SU != nullptr);
  return Best.SU;
}
// Build a candidate for SU_ with the costs the hazard recognizer reports for
// scheduling it next.
SystemZPostRASchedStrategy::Candidate::Candidate(
    SUnit *SU_, SystemZHazardRecognizer &HazardRec)
    : Candidate() {
  SU = SU_;

  // Grouping cost: for a node that must begin / end a group this is positive
  // if it would do so prematurely, and negative if it would fit naturally
  // into the schedule.
  GroupingCost = HazardRec.groupingCost(SU_);

  // Cost of the resources used by this SU.
  ResourcesCost = HazardRec.resourcesCost(SU_);
}
// Returns true if this candidate is better than other. The criteria are, in
// order: lower grouping cost, lower resources cost, greater SU height, and
// finally lower node number (original order).
bool SystemZPostRASchedStrategy::Candidate::operator<(const Candidate &other) {
  // Decoder grouping comes first.
  if (GroupingCost != other.GroupingCost)
    return GroupingCost < other.GroupingCost;

  // Then the use of resources.
  if (ResourcesCost != other.ResourcesCost)
    return ResourcesCost < other.ResourcesCost;

  // A higher SU is otherwise generally better.
  const unsigned ThisHeight = SU->getHeight();
  const unsigned OtherHeight = other.SU->getHeight();
  if (ThisHeight != OtherHeight)
    return ThisHeight > OtherHeight;

  // Everything else being equal, keep the original order.
  return SU->NodeNum < other.SU->NodeNum;
}
// Called when SU has been scheduled: take it out of the Available set and
// emit it into the hazard recognizer.
void SystemZPostRASchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  LLVM_DEBUG({
    dbgs() << "** Scheduling SU(" << SU->NodeNum << ") ";
    if (Available.size() == 1)
      dbgs() << "(only one) ";
    Candidate c(SU, *HazardRec);
    c.dumpCosts();
    dbgs() << "\n";
  });

  Available.erase(SU);
  HazardRec->EmitInstruction(SU);
}
// Called when SU becomes ready to be scheduled: flag it if pickNode() should
// consider it first, and add it to the Available set.
void SystemZPostRASchedStrategy::releaseTopNode(SUnit *SU) {
  // isScheduleHigh is set on an SU that is unbuffered, or whose (valid)
  // scheduling class begins or ends a group.
  const MCSchedClassDesc *Desc = HazardRec->getSchedClass(SU);
  bool ConsiderFirst = SU->isUnbuffered;
  if (Desc->isValid() && (Desc->BeginGroup || Desc->EndGroup))
    ConsiderFirst = true;
  SU->isScheduleHigh = ConsiderFirst;

  // Every released SU goes into the Available set.
  Available.insert(SU);
}