//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // \file This file defines a set of schedule DAG mutations that can be used to // override default scheduler behavior to enforce specific scheduling patterns. // They should be used in cases where runtime performance considerations such as // inter-wavefront interactions, mean that compile-time heuristics cannot // predict the optimal instruction ordering, or in kernels where optimum // instruction scheduling is important enough to warrant manual intervention. // //===----------------------------------------------------------------------===// #include "AMDGPUIGroupLP.h" #include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/TargetOpcodes.h" using namespace llvm; #define DEBUG_TYPE "machine-scheduler" namespace { static cl::opt EnableIGroupLP("amdgpu-igrouplp", cl::desc("Enable construction of Instruction Groups and " "their ordering for scheduling"), cl::init(false)); static cl::opt> VMEMGroupMaxSize("amdgpu-igrouplp-vmem-group-size", cl::init(None), cl::Hidden, cl::desc("The maximum number of instructions to include " "in VMEM group.")); static cl::opt> MFMAGroupMaxSize("amdgpu-igrouplp-mfma-group-size", cl::init(None), cl::Hidden, cl::desc("The maximum number of instructions to include " "in MFMA group.")); static cl::opt> LDRGroupMaxSize("amdgpu-igrouplp-ldr-group-size", cl::init(None), cl::Hidden, cl::desc("The maximum number of instructions to include " "in lds/gds read group.")); static cl::opt> LDWGroupMaxSize("amdgpu-igrouplp-ldw-group-size", cl::init(None), cl::Hidden, cl::desc("The maximum number of instructions to include " "in lds/gds write group.")); // Components of the mask that determines which instruction types may be may be // classified into a SchedGroup. enum class SchedGroupMask { NONE = 0u, ALU = 1u << 0, VALU = 1u << 1, SALU = 1u << 2, MFMA = 1u << 3, VMEM = 1u << 4, VMEM_READ = 1u << 5, VMEM_WRITE = 1u << 6, DS = 1u << 7, DS_READ = 1u << 8, DS_WRITE = 1u << 9, ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS | DS_READ | DS_WRITE, LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) }; // Classify instructions into groups to enable fine tuned control over the // scheduler. These groups may be more specific than current SchedModel // instruction classes. class SchedGroup { private: // Mask that defines which instruction types can be classified into this // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER // and SCHED_GROUP_BARRIER. SchedGroupMask SGMask; // Maximum number of SUnits that can be added to this group. Optional MaxSize; // SchedGroups will only synchronize with other SchedGroups that have the same // SyncID. int SyncID = 0; // Collection of SUnits that are classified as members of this group. SmallVector Collection; ScheduleDAGInstrs *DAG; const SIInstrInfo *TII; // Try to add and edge from SU A to SU B. bool tryAddEdge(SUnit *A, SUnit *B); // Use SGMask to determine whether we can classify MI as a member of this // SchedGroup object. bool canAddMI(const MachineInstr &MI) const; // Returns true if SU can be added to this SchedGroup. bool canAddSU(SUnit &SU) const; // Returns true if no more instructions may be added to this group. bool isFull() const; // Add SU to the SchedGroup. void add(SUnit &SU) { LLVM_DEBUG(dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) << " adding " << *SU.getInstr()); Collection.push_back(&SU); } public: // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If // MakePred is true, SU will be a predecessor of the SUnits in this // SchedGroup, otherwise SU will be a successor. void link(SUnit &SU, bool MakePred = false); // Add DAG dependencies from all SUnits in this SchedGroup and this SU. Use // the predicate to determine whether SU should be a predecessor (P = true) // or a successor (P = false) of this SchedGroup. void link(SUnit &SU, function_ref P); // Add DAG dependencies such that SUnits in this group shall be ordered // before SUnits in OtherGroup. void link(SchedGroup &OtherGroup); // Identify and add all relevant SUs from the DAG to this SchedGroup. void initSchedGroup(); // Add instructions to the SchedGroup bottom up starting from RIter. // ConflictedInstrs is a set of instructions that should not be added to the // SchedGroup even when the other conditions for adding it are satisfied. // RIter will be added to the SchedGroup as well, and dependencies will be // added so that RIter will always be scheduled at the end of the group. void initSchedGroup(std::vector::reverse_iterator RIter, DenseSet &ConflictedInstrs); int getSyncID() { return SyncID; } SchedGroup(SchedGroupMask SGMask, Optional MaxSize, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {} SchedGroup(SchedGroupMask SGMask, Optional MaxSize, int SyncID, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {} }; class IGroupLPDAGMutation : public ScheduleDAGMutation { public: const SIInstrInfo *TII; ScheduleDAGMI *DAG; IGroupLPDAGMutation() = default; void apply(ScheduleDAGInstrs *DAGInstrs) override; }; // DAG mutation that coordinates with the SCHED_BARRIER instruction and // corresponding builtin. The mutation adds edges from specific instruction // classes determined by the SCHED_BARRIER mask so that they cannot be class SchedBarrierDAGMutation : public ScheduleDAGMutation { private: const SIInstrInfo *TII; ScheduleDAGMI *DAG; // Organize lists of SchedGroups by their SyncID. SchedGroups / // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added // between then. DenseMap> SyncedSchedGroupsMap; // Used to track instructions that are already to added to a different // SchedGroup with the same SyncID. DenseMap> SyncedInstrsMap; // Add DAG edges that enforce SCHED_BARRIER ordering. void addSchedBarrierEdges(SUnit &SU); // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should // not be reordered accross the SCHED_BARRIER. This is used for the base // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that // SCHED_BARRIER will always block all instructions that can be classified // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size // and may only synchronize with some SchedGroups. Returns the inverse of // Mask. SCHED_BARRIER's mask describes which instruction types should be // allowed to be scheduled across it. Invert the mask to get the // SchedGroupMask of instructions that should be barred. SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const; // Create SchedGroups for a SCHED_GROUP_BARRIER. void initSchedGroupBarrier(std::vector::reverse_iterator RIter); // Add DAG edges that try to enforce ordering defined by SCHED_GROUP_BARRIER // instructions. void addSchedGroupBarrierEdges(); public: void apply(ScheduleDAGInstrs *DAGInstrs) override; SchedBarrierDAGMutation() = default; }; bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { if (A != B && DAG->canAddEdge(B, A)) { DAG->addEdge(B, SDep(A, SDep::Artificial)); LLVM_DEBUG(dbgs() << "Adding edge...\n" << "from: SU(" << A->NodeNum << ") " << *A->getInstr() << "to: SU(" << B->NodeNum << ") " << *B->getInstr()); return true; } return false; } bool SchedGroup::canAddMI(const MachineInstr &MI) const { bool Result = false; if (MI.isMetaInstruction()) Result = false; else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && (TII->isVALU(MI) || TII->isMFMA(MI) || TII->isSALU(MI))) Result = true; else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && TII->isVALU(MI) && !TII->isMFMA(MI)) Result = true; else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && TII->isSALU(MI)) Result = true; else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && TII->isMFMA(MI)) Result = true; else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) Result = true; else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && MI.mayLoad() && (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) Result = true; else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && MI.mayStore() && (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) Result = true; else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && TII->isDS(MI)) Result = true; else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && MI.mayLoad() && TII->isDS(MI)) Result = true; else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && MI.mayStore() && TII->isDS(MI)) Result = true; LLVM_DEBUG( dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) << (Result ? " could classify " : " unable to classify ") << MI); return Result; } void SchedGroup::link(SUnit &SU, bool MakePred) { for (auto A : Collection) { SUnit *B = &SU; if (MakePred) std::swap(A, B); tryAddEdge(A, B); } } void SchedGroup::link(SUnit &SU, function_ref P) { for (auto A : Collection) { SUnit *B = &SU; if (P(A, B)) std::swap(A, B); tryAddEdge(A, B); } } void SchedGroup::link(SchedGroup &OtherGroup) { for (auto B : OtherGroup.Collection) link(*B); } bool SchedGroup::isFull() const { return MaxSize && Collection.size() >= *MaxSize; } bool SchedGroup::canAddSU(SUnit &SU) const { MachineInstr &MI = *SU.getInstr(); if (MI.getOpcode() != TargetOpcode::BUNDLE) return canAddMI(MI); // Special case for bundled MIs. const MachineBasicBlock *MBB = MI.getParent(); MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; while (E != MBB->end() && E->isBundledWithPred()) ++E; // Return true if all of the bundled MIs can be added to this group. return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); } void SchedGroup::initSchedGroup() { for (auto &SU : DAG->SUnits) { if (isFull()) break; if (canAddSU(SU)) add(SU); } } static bool canFitIntoPipeline(SUnit &SU, ScheduleDAGInstrs *DAG, DenseSet &ConflictedInstrs) { return llvm::all_of(ConflictedInstrs, [DAG, &SU](SUnit *SuccSU) { return DAG->canAddEdge(SuccSU, &SU); }); } void SchedGroup::initSchedGroup(std::vector::reverse_iterator RIter, DenseSet &ConflictedInstrs) { SUnit &InitSU = *RIter; for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { auto &SU = *RIter; if (isFull()) break; if (canAddSU(SU) && !ConflictedInstrs.count(&SU) && canFitIntoPipeline(SU, DAG, ConflictedInstrs)) { add(SU); ConflictedInstrs.insert(&SU); tryAddEdge(&SU, &InitSU); } } add(InitSU); assert(MaxSize); (*MaxSize)++; } // Create a pipeline from the SchedGroups in PipelineOrderGroups such that we // try to enforce the relative ordering of instructions in each group. static void makePipeline(SmallVectorImpl &PipelineOrderGroups) { auto I = PipelineOrderGroups.begin(); auto E = PipelineOrderGroups.end(); for (; I != E; ++I) { auto &GroupA = *I; for (auto J = std::next(I); J != E; ++J) { auto &GroupB = *J; GroupA.link(GroupB); } } } // Same as makePipeline but with reverse ordering. static void makeReversePipeline(SmallVectorImpl &PipelineOrderGroups) { auto I = PipelineOrderGroups.rbegin(); auto E = PipelineOrderGroups.rend(); for (; I != E; ++I) { auto &GroupA = *I; for (auto J = std::next(I); J != E; ++J) { auto &GroupB = *J; GroupA.link(GroupB); } } } void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); TII = ST.getInstrInfo(); DAG = static_cast(DAGInstrs); const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); if (!TSchedModel || DAG->SUnits.empty()) return; LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); // The order of InstructionGroups in this vector defines the // order in which edges will be added. In other words, given the // present ordering, we will try to make each VMEMRead instruction // a predecessor of each DSRead instruction, and so on. SmallVector PipelineOrderGroups = { SchedGroup(SchedGroupMask::VMEM, VMEMGroupMaxSize, DAG, TII), SchedGroup(SchedGroupMask::DS_READ, LDRGroupMaxSize, DAG, TII), SchedGroup(SchedGroupMask::MFMA, MFMAGroupMaxSize, DAG, TII), SchedGroup(SchedGroupMask::DS_WRITE, LDWGroupMaxSize, DAG, TII)}; for (auto &SG : PipelineOrderGroups) SG.initSchedGroup(); makePipeline(PipelineOrderGroups); } // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER. static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER || SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); while (!SU.Preds.empty()) for (auto &P : SU.Preds) SU.removePred(P); while (!SU.Succs.empty()) for (auto &S : SU.Succs) for (auto &SP : S.getSUnit()->Preds) if (SP.getSUnit() == &SU) S.getSUnit()->removePred(SP); } void SchedBarrierDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); if (!TSchedModel || DAGInstrs->SUnits.empty()) return; LLVM_DEBUG(dbgs() << "Applying SchedBarrierDAGMutation...\n"); const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget(); TII = ST.getInstrInfo(); DAG = static_cast(DAGInstrs); SyncedInstrsMap.clear(); SyncedSchedGroupsMap.clear(); for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { if (R->getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER) addSchedBarrierEdges(*R); else if (R->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) initSchedGroupBarrier(R); } // SCHED_GROUP_BARRIER edges can only be added after we have found and // initialized all of the SCHED_GROUP_BARRIER SchedGroups. addSchedGroupBarrierEdges(); } void SchedBarrierDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { MachineInstr &MI = *SchedBarrier.getInstr(); assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); // Remove all existing edges from the SCHED_BARRIER that were added due to the // instruction having side effects. resetEdges(SchedBarrier, DAG); auto InvertedMask = invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); SchedGroup SG(InvertedMask, None, DAG, TII); SG.initSchedGroup(); // Preserve original instruction ordering relative to the SCHED_BARRIER. SG.link( SchedBarrier, (function_ref)[]( const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); } SchedGroupMask SchedBarrierDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { // Invert mask and erase bits for types of instructions that are implied to be // allowed past the SCHED_BARRIER. SchedGroupMask InvertedMask = ~Mask; // ALU implies VALU, SALU, MFMA. if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA; // VALU, SALU, MFMA implies ALU. else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE) InvertedMask &= ~SchedGroupMask::ALU; // VMEM implies VMEM_READ, VMEM_WRITE. if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE; // VMEM_READ, VMEM_WRITE implies VMEM. else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE || (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE) InvertedMask &= ~SchedGroupMask::VMEM; // DS implies DS_READ, DS_WRITE. if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE) InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE; // DS_READ, DS_WRITE implies DS. else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE || (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE) InvertedMask &= ~SchedGroupMask::DS; return InvertedMask; } void SchedBarrierDAGMutation::initSchedGroupBarrier( std::vector::reverse_iterator RIter) { // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due // to the instruction having side effects. resetEdges(*RIter, DAG); MachineInstr &SGB = *RIter->getInstr(); assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); int32_t SGMask = SGB.getOperand(0).getImm(); int32_t Size = SGB.getOperand(1).getImm(); int32_t SyncID = SGB.getOperand(2).getImm(); // Create a new SchedGroup and add it to a list that is mapped to the SyncID. // SchedGroups only enforce ordering between SchedGroups with the same SyncID. auto &SG = SyncedSchedGroupsMap[SyncID].emplace_back((SchedGroupMask)SGMask, Size, SyncID, DAG, TII); // SyncedInstrsMap is used here is used to avoid adding the same SUs in // multiple SchedGroups that have the same SyncID. This only matters for // SCHED_GROUP_BARRIER and not SCHED_BARRIER. SG.initSchedGroup(RIter, SyncedInstrsMap[SG.getSyncID()]); } void SchedBarrierDAGMutation::addSchedGroupBarrierEdges() { // Since we traversed the DAG in reverse order when initializing // SCHED_GROUP_BARRIERs we need to reverse the order in the vector to maintain // user intentions and program order. for (auto &SchedGroups : SyncedSchedGroupsMap) makeReversePipeline(SchedGroups.second); } } // namespace namespace llvm { std::unique_ptr createIGroupLPDAGMutation() { return EnableIGroupLP ? std::make_unique() : nullptr; } std::unique_ptr createSchedBarrierDAGMutation() { return std::make_unique(); } } // end namespace llvm