Reapply "[AMDGPU][Scheduler] Refactor ArchVGPR rematerialization during scheduling (#125885)" (#139548)

This reapplies 067caaa and 382a085 (reverting b35f6e2) with fixes to
issues detected by the address sanitizer (MIs have to be removed from
live intervals before being removed from their parent MBB).

Original commit description below.

The AMDGPU scheduler's `PreRARematStage` attempts to increase function
occupancy w.r.t. ArchVGPR usage by rematerializing trivially
rematerializable ArchVGPR-defining instructions next to their single use.
It first collects all eligible instructions in the function, then sinks
them one-by-one while recomputing occupancy in all affected regions each
time to determine if and when overall occupancy has increased. If it has,
the changes are committed to the scheduler's state; otherwise the
modifications to the IR are reverted and the scheduling stage gives up.

In both cases, this scheduling stage currently involves repeated queries
for up-to-date occupancy estimates and some state copying to enable
reversal of sinking decisions when occupancy is revealed not to
increase. The current implementation also does not accurately track
register pressure changes in all regions affected by sinking decisions.

This commit refactors this scheduling stage, improving RP tracking and
splitting the stage into two distinct steps to avoid repeated occupancy
queries and IR/state rollbacks.

- Analysis and collection (`canIncreaseOccupancyOrReduceSpill`). The stage
computes the number of ArchVGPRs that must be saved to reduce spilling or,
when there is no spilling, to increase function occupancy by 1. It then
collects instructions eligible for rematerialization, stopping as soon as
enough have been identified to achieve that goal (according to slightly
optimistic heuristics). If there are not enough such instructions, the
scheduling stage stops here.
- Rematerialization (`rematerialize`). Instructions collected in the
first step are rematerialized one-by-one. At this point the scheduler's
state can be updated directly, since the occupancy analysis has already
been done and no state will have to be rolled back. Register pressure in
impacted regions is recomputed only once, as opposed to at every sinking
decision.

In the case where the stage attempted to increase occupancy, if neither
the rematerializations alone nor the rescheduling that follows them
manages to improve occupancy, then all rematerializations are rolled back.
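
For orientation, here is a much-simplified sketch of the resulting control
flow. It is not the actual implementation (see the GCNSchedStrategy.cpp
changes below): method and member names follow the patch, but all
bookkeeping of live-ins, region boundaries and live intervals is omitted,
and the rollback helper at the end is hypothetical shorthand for logic the
real `finalizeGCNSchedStage` inlines.

```
// Simplified sketch only, not the patch's code.
bool PreRARematStage::initGCNSchedStage() {
  // Step 1: analysis and collection. Compute how many ArchVGPRs must be saved
  // and collect just enough trivially rematerializable instructions to
  // (optimistically) reach that goal; give up if there are not enough.
  if (!canIncreaseOccupancyOrReduceSpill())
    return false;
  // Step 2: rematerialization. Sink the collected instructions next to their
  // single use, updating the scheduler's state directly and recomputing
  // register pressure only once per impacted region.
  rematerialize();
  return true; // Impacted regions are then rescheduled.
}

void PreRARematStage::finalizeGCNSchedStage() {
  // Rollback happens only when the stage was trying to increase occupancy and
  // neither rematerialization alone nor the subsequent rescheduling reached
  // the target occupancy; spilling reductions are always kept.
  if (IncreaseOccupancy && std::max(AchievedOcc, DAG.MinOccupancy) < TargetOcc)
    rollbackRematerializations(); // hypothetical helper, inlined in the patch
}
```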
Lucas Ramirez 2025-05-13 11:11:00 +02:00 committed by GitHub
parent 3de2fa91e1
commit 6456ee056f
13 changed files with 4827 additions and 680 deletions

View File

@@ -23,6 +23,7 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/RegisterBank.h"
@@ -585,6 +586,9 @@ public:
   /// multiple uses.
   bool hasOneNonDBGUser(Register RegNo) const;
+  /// If the register has a single non-Debug instruction using the specified
+  /// register, returns it; otherwise returns nullptr.
+  MachineInstr *getOneNonDBGUser(Register RegNo) const;
   /// hasAtMostUses - Return true if the given register has at most \p MaxUsers
   /// non-debug user instructions.

View File

@@ -432,6 +432,11 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
   return hasSingleElement(use_nodbg_instructions(RegNo));
 }
+MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
+  auto RegNoDbgUsers = use_nodbg_instructions(RegNo);
+  return hasSingleElement(RegNoDbgUsers) ? &*RegNoDbgUsers.begin() : nullptr;
+}
+
 bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
                                               unsigned MaxUsers) const {
   return hasNItemsOrLess(use_instr_nodbg_begin(Reg), use_instr_nodbg_end(),
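
A hypothetical caller, not part of the patch, illustrating what the new
helper gives over the existing `hasOneNonDBGUser` predicate:

```
// Illustrative only: fetch the unique non-debug user of Reg (if any) and check
// whether it lives in a different block, which is the pattern PreRARematStage
// looks for before deciding to sink the defining instruction.
if (MachineInstr *UseMI = MRI.getOneNonDBGUser(Reg)) {
  if (UseMI->getParent() != DefMI->getParent()) {
    // Reg has exactly one non-debug user and it is in another basic block.
  }
}
```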

View File

@@ -53,11 +53,20 @@ struct GCNRegPressure {
   /// UnifiedVGPRFile
   unsigned getVGPRNum(bool UnifiedVGPRFile) const {
     if (UnifiedVGPRFile) {
-      return Value[AGPR32] ? alignTo(Value[VGPR32], 4) + Value[AGPR32]
+      return Value[AGPR32] ? getUnifiedVGPRNum(Value[VGPR32], Value[AGPR32])
                            : Value[VGPR32] + Value[AGPR32];
     }
     return std::max(Value[VGPR32], Value[AGPR32]);
   }
+
+  /// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
+  /// and \p NumAGPRs AGPRS, for a target with a unified VGPR file.
+  inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
+                                           unsigned NumAGPRs) {
+    return alignTo(NumArchVGPRs, AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
+           NumAGPRs;
+  }
+
   /// \returns the ArchVGPR32 pressure
   unsigned getArchVGPRNum() const { return Value[VGPR32]; }
   /// \returns the AccVGPR32 pressure
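
A worked example of the new helper, assuming the ArchVGPR allocation granule
of 4 that `getArchVGPRAllocGranule()` returns further down in this patch:

```
// With a unified register file, ArchVGPRs are allocated in granules, so the
// aggregated pressure rounds the ArchVGPR count up before adding AGPRs.
unsigned NumArchVGPRs = 10, NumAGPRs = 6;
unsigned Unified = GCNRegPressure::getUnifiedVGPRNum(NumArchVGPRs, NumAGPRs);
// alignTo(10, 4) + 6 == 12 + 6 == 18
assert(Unified == 18);
```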

View File

@@ -25,8 +25,13 @@
 #include "GCNSchedStrategy.h"
 #include "AMDGPUIGroupLP.h"
+#include "GCNRegPressure.h"
 #include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/ErrorHandling.h"
 #define DEBUG_TYPE "machine-scheduler"
@@ -890,13 +895,13 @@ GCNScheduleDAGMILive::getRegionLiveInMap() const {
   std::vector<MachineInstr *> RegionFirstMIs;
   RegionFirstMIs.reserve(Regions.size());
   auto I = Regions.rbegin(), E = Regions.rend();
-  auto *BB = I->first->getParent();
   do {
+    const MachineBasicBlock *MBB = I->first->getParent();
     auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
     RegionFirstMIs.push_back(MI);
     do {
       ++I;
-    } while (I != E && I->first->getParent() == BB);
+    } while (I != E && I->first->getParent() == MBB);
   } while (I != E);
   return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
 }
@@ -1081,31 +1086,46 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
   return true;
 }
+/// Allows to easily filter for this stage's debug output.
+#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
 bool PreRARematStage::initGCNSchedStage() {
-  if (!GCNSchedStage::initGCNSchedStage())
-    return false;
-  if (DAG.RegionsWithMinOcc.none() || DAG.Regions.size() == 1)
-    return false;
-  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-  // Rematerialization will not help if occupancy is not limited by reg usage.
-  if (ST.getOccupancyWithWorkGroupSizes(MF).second == DAG.MinOccupancy)
-    return false;
-  // FIXME: This pass will invalidate cached MBBLiveIns for regions
-  // inbetween the defs and region we sinked the def to. Cached pressure
-  // for regions where a def is sinked from will also be invalidated. Will
-  // need to be fixed if there is another pass after this pass.
+  // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
+  // regions inbetween the defs and region we sinked the def to. Will need to be
+  // fixed if there is another pass after this pass.
   assert(!S.hasNextStage());
-  collectRematerializableInstructions();
-  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
+  if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() ||
+      DAG.Regions.size() == 1)
     return false;
-  LLVM_DEBUG(
-      dbgs() << "Retrying function scheduling with improved occupancy of "
-             << DAG.MinOccupancy << " from rematerializing\n");
+  // Before performing any IR modification record the parent region of each MI
+  // and the parent MBB of each region.
+  const unsigned NumRegions = DAG.Regions.size();
+  RegionBB.reserve(NumRegions);
+  for (unsigned I = 0; I < NumRegions; ++I) {
+    RegionBoundaries Region = DAG.Regions[I];
+    for (auto MI = Region.first; MI != Region.second; ++MI)
+      MIRegion.insert({&*MI, I});
+    RegionBB.push_back(Region.first->getParent());
+  }
+  if (!canIncreaseOccupancyOrReduceSpill())
+    return false;
+  // Rematerialize identified instructions and update scheduler's state.
+  rematerialize();
+  if (GCNTrackers)
+    DAG.RegionLiveOuts.buildLiveRegMap();
+  REMAT_DEBUG(
+      dbgs() << "Retrying function scheduling with new min. occupancy of "
+             << AchievedOcc << " from rematerializing (original was "
+             << DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
+  if (AchievedOcc > DAG.MinOccupancy) {
+    DAG.MinOccupancy = AchievedOcc;
+    SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+    MFI.increaseOccupancy(MF, DAG.MinOccupancy);
+  }
   return true;
 }
@@ -1493,8 +1513,7 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
       dbgs()
       << "\n\t *** In shouldRevertScheduling ***\n"
       << " *********** BEFORE UnclusteredHighRPStage ***********\n");
-  ScheduleMetrics MBefore =
-      getScheduleMetrics(DAG.SUnits);
+  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
   LLVM_DEBUG(
       dbgs()
       << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
@@ -1527,13 +1546,9 @@ bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
 }
 bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
-  if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
-    return true;
-  if (mayCauseSpilling(WavesAfter))
-    return true;
-  return false;
+  return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
+         mayCauseSpilling(WavesAfter) ||
+         (IncreaseOccupancy && WavesAfter < TargetOcc);
 }
 bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
@ -1683,160 +1698,407 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
return true; return true;
} }
void PreRARematStage::collectRematerializableInstructions() { namespace {
/// Models excess register pressure in a region and tracks our progress as we
/// identify rematerialization opportunities.
struct ExcessRP {
/// Number of excess ArchVGPRs.
unsigned ArchVGPRs = 0;
/// Number of excess AGPRs.
unsigned AGPRs = 0;
/// For unified register files, number of excess VGPRs.
unsigned VGPRs = 0;
/// For unified register files with AGPR usage, number of excess ArchVGPRs to
/// save before we are able to save a whole allocation granule.
unsigned ArchVGPRsToAlignment = 0;
/// Whether the region uses AGPRs.
bool HasAGPRs = false;
/// Whether the subtarget has a unified RF.
bool UnifiedRF;
/// Constructs the excess RP model; determines the excess pressure w.r.t. a
/// maximum number of allowed VGPRs.
ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs);
/// Accounts for \p NumRegs saved ArchVGPRs in the model. If \p
/// UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess
/// AGPRs once excess ArchVGPR pressure has been eliminated. Returns whether
/// saving these ArchVGPRs helped reduce excess pressure.
bool saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill);
/// Accounts for \p NumRegs saved AGPRS in the model. Returns whether saving
/// these ArchVGPRs helped reduce excess pressure.
bool saveAGPRs(unsigned NumRegs);
/// Returns whether there is any excess register pressure.
operator bool() const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0; }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
friend raw_ostream &operator<<(raw_ostream &OS, const ExcessRP &Excess) {
OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and "
<< Excess.VGPRs << " VGPRs (next ArchVGPR aligment in "
<< Excess.ArchVGPRsToAlignment << " registers)\n";
return OS;
}
#endif
private:
static inline bool saveRegs(unsigned &LeftToSave, unsigned &NumRegs) {
unsigned NumSaved = std::min(LeftToSave, NumRegs);
NumRegs -= NumSaved;
LeftToSave -= NumSaved;
return NumSaved;
}
};
} // namespace
ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
unsigned MaxVGPRs)
: UnifiedRF(ST.hasGFX90AInsts()) {
unsigned NumArchVGPRs = RP.getArchVGPRNum();
unsigned NumAGPRs = RP.getAGPRNum();
HasAGPRs = NumAGPRs;
if (!UnifiedRF) {
// Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
// independently.
if (NumArchVGPRs > MaxVGPRs)
ArchVGPRs = NumArchVGPRs - MaxVGPRs;
if (NumAGPRs > MaxVGPRs)
AGPRs = NumAGPRs - MaxVGPRs;
return;
}
// Independently of whether overall VGPR pressure is under the limit, we still
// have to check whether ArchVGPR pressure or AGPR pressure alone exceeds the
// number of addressable registers in each category.
const unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
if (NumArchVGPRs > MaxArchVGPRs) {
ArchVGPRs = NumArchVGPRs - MaxArchVGPRs;
NumArchVGPRs = MaxArchVGPRs;
}
if (NumAGPRs > MaxArchVGPRs) {
AGPRs = NumAGPRs - MaxArchVGPRs;
NumAGPRs = MaxArchVGPRs;
}
// Check overall VGPR usage against the limit; any excess above addressable
// register limits has already been accounted for.
const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule();
unsigned NumVGPRs = GCNRegPressure::getUnifiedVGPRNum(NumArchVGPRs, NumAGPRs);
if (NumVGPRs > MaxVGPRs) {
VGPRs = NumVGPRs - MaxVGPRs;
ArchVGPRsToAlignment = NumArchVGPRs - alignDown(NumArchVGPRs, Granule);
if (!ArchVGPRsToAlignment)
ArchVGPRsToAlignment = Granule;
}
}
bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
bool Progress = saveRegs(ArchVGPRs, NumRegs);
if (!NumRegs)
return Progress;
if (!UnifiedRF) {
if (UseArchVGPRForAGPRSpill)
Progress |= saveRegs(AGPRs, NumRegs);
} else if (HasAGPRs && (VGPRs || (UseArchVGPRForAGPRSpill && AGPRs))) {
// There is progress as long as there are VGPRs left to save, even if the
// save induced by this particular call does not cross an ArchVGPR alignment
// barrier.
Progress = true;
// ArchVGPRs can only be allocated as a multiple of a granule in unified RF.
unsigned NumSavedRegs = 0;
// Count the number of whole ArchVGPR allocation granules we can save.
const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule();
if (unsigned NumGranules = NumRegs / Granule; NumGranules) {
NumSavedRegs = NumGranules * Granule;
NumRegs -= NumSavedRegs;
}
// We may be able to save one more whole ArchVGPR allocation granule.
if (NumRegs >= ArchVGPRsToAlignment) {
NumSavedRegs += Granule;
ArchVGPRsToAlignment = Granule - (NumRegs - ArchVGPRsToAlignment);
} else {
ArchVGPRsToAlignment -= NumRegs;
}
// Prioritize saving generic VGPRs, then AGPRs if we allow AGPR-to-ArchVGPR
// spilling and have some free ArchVGPR slots.
saveRegs(VGPRs, NumSavedRegs);
if (UseArchVGPRForAGPRSpill)
saveRegs(AGPRs, NumSavedRegs);
} else {
// No AGPR usage in the region i.e., no allocation granule to worry about.
Progress |= saveRegs(VGPRs, NumRegs);
}
return Progress;
}
bool ExcessRP::saveAGPRs(unsigned NumRegs) {
return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
}
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
Register Reg = Register::index2VirtReg(I);
if (!DAG.LIS->hasInterval(Reg))
continue;
// TODO: Handle AGPR and SGPR rematerialization REMAT_DEBUG({
if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) || dbgs() << "Collecting rematerializable instructions in ";
!DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg)) MF.getFunction().printAsOperand(dbgs(), false);
continue; dbgs() << '\n';
});
MachineOperand *Op = DAG.MRI.getOneDef(Reg); // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
MachineInstr *Def = Op->getParent(); // occupancy, or regions with VGPR spilling) to a model of their excess RP.
if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def)) DenseMap<unsigned, ExcessRP> OptRegions;
continue; const Function &F = MF.getFunction();
MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg); std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
if (Def->getParent() == UseI->getParent()) const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
continue; const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
const unsigned MaxSGPRsIncOcc =
ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
bool HasRematDependency = false; auto ClearOptRegionsIf = [&](bool Cond) -> bool {
// Check if this instruction uses any registers that are planned to be if (Cond) {
// rematerialized // We won't try to increase occupancy.
for (auto &RematEntry : RematerializableInsts) { IncreaseOccupancy = false;
if (find_if(RematEntry.second, OptRegions.clear();
[&Def](std::pair<MachineInstr *, MachineInstr *> &Remat) {
for (MachineOperand &MO : Def->operands()) {
if (!MO.isReg())
continue;
if (MO.getReg() == Remat.first->getOperand(0).getReg())
return true;
} }
return Cond;
};
// Collect optimizable regions. If there is spilling in any region we will
// just try to reduce ArchVGPR spilling. Otherwise we will try to increase
// occupancy by one in the whole function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
GCNRegPressure &RP = DAG.Pressure[I];
// Check whether SGPR pressures prevents us from eliminating spilling.
unsigned NumSGPRs = RP.getSGPRNum();
if (NumSGPRs > MaxSGPRsNoSpill)
ClearOptRegionsIf(IncreaseOccupancy);
ExcessRP Excess(ST, RP, MaxVGPRsNoSpill);
if (Excess) {
ClearOptRegionsIf(IncreaseOccupancy);
} else if (IncreaseOccupancy) {
// Check whether SGPR pressure prevents us from increasing occupancy.
if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
if (DAG.MinOccupancy >= WavesPerEU.first)
return false; return false;
}) != RematEntry.second.end()) { continue;
HasRematDependency = true; }
break; if ((Excess = ExcessRP(ST, RP, MaxVGPRsIncOcc))) {
// We can only rematerialize ArchVGPRs at this point.
unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs;
bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum();
if (ClearOptRegionsIf(Excess.AGPRs || NotEnoughArchVGPRs)) {
if (DAG.MinOccupancy >= WavesPerEU.first)
return false;
continue;
} }
} }
// Do not rematerialize an instruction if it uses an instruction that we }
// have designated for rematerialization. if (Excess)
OptRegions.insert({I, Excess});
}
if (OptRegions.empty())
return false;
#ifndef NDEBUG
if (IncreaseOccupancy)
REMAT_DEBUG(dbgs() << "Occupancy minimal in regions:\n");
else
REMAT_DEBUG(dbgs() << "Spilling in regions:\n");
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
REMAT_DEBUG(dbgs() << " " << I << ": " << OptIt->getSecond() << '\n');
}
#endif
// When we are reducing spilling, the target is the minimum target number of
// waves/EU determined by the subtarget.
TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : WavesPerEU.first;
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
// achieve our goal, and sets Progress to true when this particular reduction
// in pressure was helpful toward that goal.
auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask,
bool &Progress) -> bool {
ExcessRP &Excess = OptIt->getSecond();
// We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
// only when we are just trying to eliminate spilling to memory. At this
// point we err on the conservative side and do not increase
// register-to-register spilling for the sake of increasing occupancy.
Progress |=
Excess.saveArchVGPRs(SIRegisterInfo::getNumCoveredRegs(Mask),
/*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy);
if (!Excess)
OptRegions.erase(OptIt->getFirst());
return OptRegions.empty();
};
// We need up-to-date live-out info. to query live-out register masks in
// regions containing rematerializable instructions.
DAG.RegionLiveOuts.buildLiveRegMap();
// Cache set of registers that are going to be rematerialized.
DenseSet<unsigned> RematRegs;
// Identify rematerializable instructions in the function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
auto Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI) {
// The instruction must be trivially rematerializable.
MachineInstr &DefMI = *MI;
if (!isTriviallyReMaterializable(DefMI))
continue;
// We only support rematerializing virtual VGPRs with one definition.
Register Reg = DefMI.getOperand(0).getReg();
if (!Reg.isVirtual() || !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
!DAG.MRI.hasOneDef(Reg))
continue;
// We only care to rematerialize the instruction if it has a single
// non-debug user in a different region. The using MI may not belong to a
// region if it is a lone region terminator.
MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
if (!UseMI)
continue;
auto UseRegion = MIRegion.find(UseMI);
if (UseRegion != MIRegion.end() && UseRegion->second == I)
continue;
// Do not rematerialize an instruction if it uses or is used by an
// instruction that we have designated for rematerialization.
// FIXME: Allow for rematerialization chains: this requires 1. updating // FIXME: Allow for rematerialization chains: this requires 1. updating
// remat points to account for uses that are rematerialized, and 2. either // remat points to account for uses that are rematerialized, and 2. either
// rematerializing the candidates in careful ordering, or deferring the MBB // rematerializing the candidates in careful ordering, or deferring the
// RP walk until the entire chain has been rematerialized. // MBB RP walk until the entire chain has been rematerialized.
if (HasRematDependency) if (Rematerializations.contains(UseMI) ||
llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
return MO.isReg() && RematRegs.contains(MO.getReg());
}))
continue; continue;
// Similarly, check if the UseI is planned to be remat. // Do not rematerialize an instruction it it uses registers that aren't
for (auto &RematEntry : RematerializableInsts) { // available at its use. This ensures that we are not extending any live
if (find_if(RematEntry.second, // range while rematerializing.
[&UseI](std::pair<MachineInstr *, MachineInstr *> &Remat) { SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
return Remat.first == UseI; SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
}) != RematEntry.second.end()) { if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
HasRematDependency = true;
break;
}
}
if (HasRematDependency)
break;
// We are only collecting defs that are defined in another block and are
// live-through or used inside regions at MinOccupancy. This means that the
// register must be in the live-in set for the region.
bool AddedToRematList = false;
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
auto It = DAG.LiveIns[I].find(Reg);
if (It != DAG.LiveIns[I].end() && !It->second.none()) {
if (DAG.RegionsWithMinOcc[I]) {
SlotIndex DefIdx = DAG.LIS->getInstructionIndex(*Def);
SlotIndex UseIdx =
DAG.LIS->getInstructionIndex(*UseI).getRegSlot(true);
if (allUsesAvailableAt(Def, DefIdx, UseIdx)) {
RematerializableInsts[I][Def] = UseI;
AddedToRematList = true;
}
}
// Collect regions with rematerializable reg as live-in to avoid
// searching later when updating RP.
RematDefToLiveInRegions[Def].push_back(I);
}
}
if (!AddedToRematList)
RematDefToLiveInRegions.erase(Def);
}
}
bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII) {
// Temporary copies of cached variables we will be modifying and replacing if
// sinking succeeds.
SmallVector<
std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
NewRegions;
DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
DenseMap<unsigned, GCNRegPressure> NewPressure;
BitVector NewRescheduleRegions;
LiveIntervals *LIS = DAG.LIS;
NewRegions.resize(DAG.Regions.size());
NewRescheduleRegions.resize(DAG.Regions.size());
// Collect only regions that has a rematerializable def as a live-in.
SmallSet<unsigned, 16> ImpactedRegions;
for (const auto &It : RematDefToLiveInRegions)
ImpactedRegions.insert_range(It.second);
// Make copies of register pressure and live-ins cache that will be updated
// as we rematerialize.
for (auto Idx : ImpactedRegions) {
NewPressure[Idx] = DAG.Pressure[Idx];
NewLiveIns[Idx] = DAG.LiveIns[Idx];
}
NewRegions = DAG.Regions;
NewRescheduleRegions.reset();
DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
bool Improved = false;
for (auto I : ImpactedRegions) {
if (!DAG.RegionsWithMinOcc[I])
continue; continue;
Improved = false; REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts()); RematInstruction &Remat =
int SGPRUsage = NewPressure[I].getSGPRNum(); Rematerializations.try_emplace(&DefMI, UseMI).first->second;
// TODO: Handle occupancy drop due to AGPR and SGPR. bool RematUseful = false;
// Check if cause of occupancy drop is due to VGPR usage and not SGPR. if (auto It = OptRegions.find(I); It != OptRegions.end()) {
if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy) // Optimistically consider that moving the instruction out of its
break; // defining region will reduce RP in the latter; this assumes that
// maximum RP in the region is reached somewhere between the defining
// The occupancy of this region could have been improved by a previous // instruction and the end of the region.
// iteration's sinking of defs. REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) { LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
NewRescheduleRegions[I] = true; if (ReduceRPInRegion(It, Mask, RematUseful))
Improved = true; return true;
continue;
} }
// First check if we have enough trivially rematerializable instructions to for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
// improve occupancy. Optimistically assume all instructions we are able to // We are only collecting regions in which the register is a live-in
// sink decreased RP. // (and may be live-through).
int TotalSinkableRegs = 0; auto It = DAG.LiveIns[LIRegion].find(Reg);
for (const auto &It : RematerializableInsts[I]) { if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
MachineInstr *Def = It.first; continue;
Register DefReg = Def->getOperand(0).getReg(); Remat.LiveInRegions.insert(LIRegion);
TotalSinkableRegs +=
SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]); // Account for the reduction in RP due to the rematerialization in an
// optimizable region in which the defined register is a live-in. This
// is exact for live-through region but optimistic in the using region,
// where RP is actually reduced only if maximum RP is reached somewhere
// between the beginning of the region and the rematerializable
// instruction's use.
if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RematUseful))
return true;
}
}
// If the instruction is not a live-in or live-out in any optimizable
// region then there is no point in rematerializing it.
if (!RematUseful) {
Rematerializations.pop_back();
REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
} else {
RematRegs.insert(Reg);
}
}
}
if (IncreaseOccupancy) {
// We were trying to increase occupancy but failed, abort the stage.
REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
Rematerializations.clear();
return false;
}
REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
return !Rematerializations.empty();
}
void PreRARematStage::rematerialize() {
const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
// Collect regions whose RP changes in unpredictable way; we will have to
// fully recompute their RP after all rematerailizations.
DenseSet<unsigned> RecomputeRP;
// Rematerialize all instructions.
for (auto &[DefMI, Remat] : Rematerializations) {
MachineBasicBlock::iterator InsertPos(Remat.UseMI);
Register Reg = DefMI->getOperand(0).getReg();
unsigned SubReg = DefMI->getOperand(0).getSubReg();
unsigned DefRegion = MIRegion.at(DefMI);
// Rematerialize DefMI to its use block.
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
*DAG.TRI);
Remat.RematMI = &*std::prev(InsertPos);
Remat.RematMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
// Update region boundaries in regions we sinked from (remove defining MI)
// and to (insert MI rematerialized in use block). Only then we can erase
// the original MI.
DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
auto UseRegion = MIRegion.find(Remat.UseMI);
if (UseRegion != MIRegion.end()) {
DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
Remat.RematMI);
}
DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
DefMI->eraseFromParent();
// Collect all regions impacted by the rematerialization and update their
// live-in/RP information.
for (unsigned I : Remat.LiveInRegions) {
ImpactedRegions.insert({I, DAG.Pressure[I]});
GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
#ifdef EXPENSIVE_CHECKS #ifdef EXPENSIVE_CHECKS
// All uses are known to be available / live at the remat point. Thus, the // All uses are known to be available / live at the remat point. Thus, the
// uses should already be live in to the region. // uses should already be live in to the region.
for (MachineOperand &MO : Def->operands()) { for (MachineOperand &MO : DefMI->operands()) {
if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
continue; continue;
@ -1844,13 +2106,12 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
if (!UseReg.isVirtual()) if (!UseReg.isVirtual())
continue; continue;
LiveInterval &LI = LIS->getInterval(UseReg); LiveInterval &LI = DAG.LIS->getInterval(UseReg);
LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg()); LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
if (LI.hasSubRanges() && MO.getSubReg()) if (LI.hasSubRanges() && MO.getSubReg())
LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
assert(NewLiveIns[I].contains(UseReg)); LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
LaneBitmask LiveInMask = NewLiveIns[I][UseReg];
LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
// If this register has lanes not covered by the LiveIns, be sure they // If this register has lanes not covered by the LiveIns, be sure they
// do not map to any subrange. ref: // do not map to any subrange. ref:
@ -1862,126 +2123,64 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
} }
} }
#endif #endif
// The register is no longer a live-in in all regions but the one that
// contains the single use. In live-through regions, maximum register
// pressure decreases predictably so we can directly update it. In the
// using region, maximum RP may or may not decrease, so we will mark it
// for re-computation after all materializations have taken place.
LaneBitmask PrevMask = RegionLiveIns[Reg];
RegionLiveIns.erase(Reg);
RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
else
RecomputeRP.insert(I);
} }
int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs; // RP in the region from which the instruction was rematerialized may or may
unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink); // not decrease.
// If in the most optimistic scenario, we cannot improve occupancy, then do ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
// not attempt to sink any instructions. RecomputeRP.insert(DefRegion);
if (OptimisticOccupancy <= DAG.MinOccupancy)
break;
unsigned ImproveOccupancy = 0; // Recompute live interval to reflect the register's rematerialization.
SmallVector<MachineInstr *, 4> SinkedDefs; Register RematReg = Remat.RematMI->getOperand(0).getReg();
for (auto &It : RematerializableInsts[I]) { DAG.LIS->removeInterval(RematReg);
MachineInstr *Def = It.first; DAG.LIS->createAndComputeVirtRegInterval(RematReg);
MachineBasicBlock::iterator InsertPos = }
MachineBasicBlock::iterator(It.second);
Register Reg = Def->getOperand(0).getReg();
// Rematerialize MI to its use block.
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
LIS->InsertMachineInstrInMaps(*NewMI);
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
InsertedMIToOldDef[NewMI] = Def;
// Update region boundaries in scheduling region we sinked from since we // All regions impacted by at least one rematerialization must be rescheduled.
// may sink an instruction that was at the beginning or end of its region // Maximum pressure must also be recomputed for all regions where it changed
DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr, // non-predictably and checked against the target occupancy.
/*Removing =*/true); AchievedOcc = TargetOcc;
for (auto &[I, OriginalRP] : ImpactedRegions) {
bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
DAG.RescheduleRegions[I] = !IsEmptyRegion;
if (!RecomputeRP.contains(I))
continue;
// Update region boundaries in region we sinked to. GCNRegPressure RP;
DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI); if (IsEmptyRegion) {
RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
LaneBitmask PrevMask = NewLiveIns[I][Reg];
// FIXME: Also update cached pressure for where the def was sinked from.
// Update RP for all regions that has this reg as a live-in and remove
// the reg from all regions as a live-in.
for (auto Idx : RematDefToLiveInRegions[Def]) {
NewLiveIns[Idx].erase(Reg);
if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
// Def is live-through and not used in this block.
NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
} else { } else {
// Def is used and rematerialized into this block. GCNDownwardRPTracker RPT(*DAG.LIS);
GCNDownwardRPTracker RPT(*LIS); auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
auto *NonDbgMI = &*skipDebugInstructionsForward( DAG.Regions[I].second);
NewRegions[Idx].first, NewRegions[Idx].second); if (NonDbgMI == DAG.Regions[I].second) {
RPT.reset(*NonDbgMI, &NewLiveIns[Idx]); // Region is non-empty but contains only debug instructions.
RPT.advance(NewRegions[Idx].second); RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
NewPressure[Idx] = RPT.moveMaxPressure(); } else {
RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
RPT.advance(DAG.Regions[I].second);
RP = RPT.moveMaxPressure();
} }
} }
DAG.Pressure[I] = RP;
SinkedDefs.push_back(Def); AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(ST));
ImproveOccupancy = NewPressure[I].getOccupancy(ST); }
if (ImproveOccupancy > DAG.MinOccupancy) REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
break;
}
// Remove defs we just sinked from all regions' list of sinkable defs
for (auto &Def : SinkedDefs)
for (auto TrackedIdx : RematDefToLiveInRegions[Def])
RematerializableInsts[TrackedIdx].erase(Def);
if (ImproveOccupancy <= DAG.MinOccupancy)
break;
NewRescheduleRegions[I] = true;
Improved = true;
}
if (!Improved) {
// Occupancy was not improved for all regions that were at MinOccupancy.
// Undo sinking and remove newly rematerialized instructions.
for (auto &Entry : InsertedMIToOldDef) {
MachineInstr *MI = Entry.first;
MachineInstr *OldMI = Entry.second;
Register Reg = MI->getOperand(0).getReg();
LIS->RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
OldMI->clearRegisterDeads(Reg);
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
}
return false;
}
// Occupancy was improved for all regions.
for (auto &Entry : InsertedMIToOldDef) {
MachineInstr *MI = Entry.first;
MachineInstr *OldMI = Entry.second;
// Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
DAG.BBLiveInMap.erase(OldMI);
// Remove OldMI and update LIS
Register Reg = MI->getOperand(0).getReg();
LIS->RemoveMachineInstrFromMaps(*OldMI);
OldMI->eraseFromParent();
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
}
// Update live-ins, register pressure, and regions caches.
for (auto Idx : ImpactedRegions) {
DAG.LiveIns[Idx] = NewLiveIns[Idx];
DAG.Pressure[Idx] = NewPressure[Idx];
DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
}
DAG.Regions = NewRegions;
DAG.RescheduleRegions = NewRescheduleRegions;
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
return true;
} }
// Copied from MachineLICM
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
if (!DAG.TII->isTriviallyReMaterializable(MI)) if (!DAG.TII->isTriviallyReMaterializable(MI))
return false; return false;
@ -1999,46 +2198,83 @@ bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
return true; return true;
} }
// When removing, we will have to check both beginning and ending of the region. void PreRARematStage::finalizeGCNSchedStage() {
// When inserting, we will only have to check if we are inserting NewMI in front // We consider that reducing spilling is always beneficial so we never
// of a scheduling region and do not need to check the ending since we will only // rollback rematerializations in such cases. It's also possible that
// ever be inserting before an already existing MI. // rescheduling lowers occupancy over the one achieved just through remats, in
// which case we do not want to rollback either (the rescheduling was already
// reverted in PreRARematStage::shouldRevertScheduling in such cases).
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
return;
REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
// Rollback the rematerializations.
for (const auto &[DefMI, Remat] : Rematerializations) {
MachineInstr &RematMI = *Remat.RematMI;
unsigned DefRegion = MIRegion.at(DefMI);
MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
MachineBasicBlock *MBB = RegionBB[DefRegion];
Register Reg = RematMI.getOperand(0).getReg();
unsigned SubReg = RematMI.getOperand(0).getSubReg();
// Re-rematerialize MI at the end of its original region. Note that it may
// not be rematerialized exactly in the same position as originally within
// the region, but it should not matter much.
TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
NewMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*NewMI);
auto UseRegion = MIRegion.find(Remat.UseMI);
if (UseRegion != MIRegion.end()) {
DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
nullptr);
}
DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
// Erase rematerialized MI.
DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
RematMI.eraseFromParent();
// Recompute live interval for the re-rematerialized register
DAG.LIS->removeInterval(Reg);
DAG.LIS->createAndComputeVirtRegInterval(Reg);
// Re-add the register as a live-in in all regions it used to be one in.
for (unsigned LIRegion : Remat.LiveInRegions)
DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
}
// Reset RP in all impacted regions.
for (auto &[I, OriginalRP] : ImpactedRegions)
DAG.Pressure[I] = OriginalRP;
GCNSchedStage::finalizeGCNSchedStage();
}
void GCNScheduleDAGMILive::updateRegionBoundaries( void GCNScheduleDAGMILive::updateRegionBoundaries(
SmallVectorImpl<std::pair<MachineBasicBlock::iterator, RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
MachineBasicBlock::iterator>> &RegionBoundaries, MachineInstr *NewMI) {
MachineBasicBlock::iterator MI, MachineInstr *NewMI, bool Removing) { assert(!NewMI ||
unsigned I = 0, E = RegionBoundaries.size(); NewMI != RegionBounds.second && "cannot remove at region end");
// Search for first region of the block where MI is located
while (I != E && MI->getParent() != RegionBoundaries[I].first->getParent())
++I;
for (; I != E; ++I) { if (RegionBounds.first == RegionBounds.second) {
if (MI->getParent() != RegionBoundaries[I].first->getParent()) assert(NewMI && "cannot remove from an empty region");
return; RegionBounds.first = NewMI;
if (Removing && MI == RegionBoundaries[I].first &&
MI == RegionBoundaries[I].second) {
// MI is in a region with size 1, after removing, the region will be
// size 0, set RegionBegin and RegionEnd to pass end of block iterator.
RegionBoundaries[I] =
std::pair(MI->getParent()->end(), MI->getParent()->end());
return; return;
} }
if (MI == RegionBoundaries[I].first) {
if (Removing) // We only care for modifications at the beginning of a non-empty region since
RegionBoundaries[I] = // the upper region boundary is exclusive.
std::pair(std::next(MI), RegionBoundaries[I].second); if (MI != RegionBounds.first)
return;
if (!NewMI)
RegionBounds.first = std::next(MI); // Removal
else else
// Inserted NewMI in front of region, set new RegionBegin to NewMI RegionBounds.first = NewMI; // Insertion
RegionBoundaries[I] = std::pair(MachineBasicBlock::iterator(NewMI),
RegionBoundaries[I].second);
return;
}
if (Removing && MI == RegionBoundaries[I].second) {
RegionBoundaries[I] = std::pair(RegionBoundaries[I].first, std::prev(MI));
return;
}
}
} }
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {

View File

@@ -14,7 +14,9 @@
 #define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
 #include "GCNRegPressure.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 namespace llvm {
@@ -214,6 +216,11 @@ public:
   }
 };
+/// A region's boundaries i.e. a pair of instruction bundle iterators. The lower
+/// boundary is inclusive, the upper boundary is exclusive.
+using RegionBoundaries =
+    std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>;
+
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
@@ -234,8 +241,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   unsigned MinOccupancy;
   // Vector of regions recorder for later rescheduling
-  SmallVector<std::pair<MachineBasicBlock::iterator,
-                        MachineBasicBlock::iterator>, 32> Regions;
+  SmallVector<RegionBoundaries, 32> Regions;
   // Records if a region is not yet scheduled, or schedule has been reverted,
   // or we generally desire to reschedule it.
@@ -286,12 +292,13 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // Compute and cache live-ins and pressure for all regions in block.
   void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);
-  // Update region boundaries when removing MI or inserting NewMI before MI.
-  void updateRegionBoundaries(
-      SmallVectorImpl<std::pair<MachineBasicBlock::iterator,
-                                MachineBasicBlock::iterator>> &RegionBoundaries,
-      MachineBasicBlock::iterator MI, MachineInstr *NewMI,
-      bool Removing = false);
+  /// If necessary, updates a region's boundaries following insertion ( \p NewMI
+  /// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region.
+  /// For an MI removal, this must be called before the MI is actually erased
+  /// from its parent MBB.
+  void updateRegionBoundaries(RegionBoundaries &RegionBounds,
+                              MachineBasicBlock::iterator MI,
+                              MachineInstr *NewMI);
   void runSchedStages();
@ -431,30 +438,73 @@ public:
: GCNSchedStage(StageID, DAG) {} : GCNSchedStage(StageID, DAG) {}
}; };
/// Attempts to reduce function spilling or, if there is no spilling, to
/// increase function occupancy by one with respect to ArchVGPR usage by sinking
/// trivially rematerializable instructions to their use. When the stage
/// estimates reducing spilling or increasing occupancy is possible, as few
/// instructions as possible are rematerialized to reduce potential negative
/// effects on function latency.
///
/// TODO: We should extend this to work on SGPRs and AGPRs as well.
class PreRARematStage : public GCNSchedStage { class PreRARematStage : public GCNSchedStage {
private: private:
// Each region at MinOccupancy will have their own list of trivially /// Useful information about a rematerializable instruction.
// rematerializable instructions we can remat to reduce RP. The list maps an struct RematInstruction {
// instruction to the position we should remat before, usually the MI using /// Single use of the rematerializable instruction's defined register,
// the rematerializable instruction. /// located in a different block.
MapVector<unsigned, MapVector<MachineInstr *, MachineInstr *>> MachineInstr *UseMI;
RematerializableInsts; /// Rematerialized version of \p DefMI, set in
/// PreRARematStage::rematerialize. Used for reverting rematerializations.
MachineInstr *RematMI;
/// Set of regions in which the rematerializable instruction's defined
/// register is a live-in.
SmallDenseSet<unsigned, 4> LiveInRegions;
// Map a trivially rematerializable def to a list of regions at MinOccupancy RematInstruction(MachineInstr *UseMI) : UseMI(UseMI) {}
// that has the defined reg as a live-in. };
MapVector<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;
// Collect all trivially rematerializable VGPR instructions with a single def /// Maps all MIs to their parent region. MI terminators are considered to be
// and single use outside the defining block into RematerializableInsts. /// outside the region they delimitate, and as such are not stored in the map.
void collectRematerializableInstructions(); DenseMap<MachineInstr *, unsigned> MIRegion;
/// Parent MBB to each region, in region order.
SmallVector<MachineBasicBlock *> RegionBB;
/// Collects instructions to rematerialize.
MapVector<MachineInstr *, RematInstruction> Rematerializations;
/// Collects regions whose live-ins or register pressure will change due to
/// rematerializations.
DenseMap<unsigned, GCNRegPressure> ImpactedRegions;
/// In case we need to rollback rematerializations, save lane masks for all
/// rematerialized registers in all regions in which they are live-ins.
DenseMap<std::pair<unsigned, Register>, LaneBitmask> RegMasks;
/// Target occupancy the stage estimates is reachable through
/// rematerialization. Greater than or equal to the pre-stage min occupancy.
unsigned TargetOcc;
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
/// Smaller than or equal to the target occupancy.
unsigned AchievedOcc;
/// Whether the stage is attempting to increase occupancy in the abscence of
/// spilling.
bool IncreaseOccupancy;
/// Returns whether remat can reduce spilling or increase function occupancy
/// by 1 through rematerialization. If it can do one, collects instructions in
/// PreRARematStage::Rematerializations and sets the target occupancy in
/// PreRARematStage::TargetOccupancy.
bool canIncreaseOccupancyOrReduceSpill();
/// Whether the MI is trivially rematerializable and does not have any virtual
/// register use.
bool isTriviallyReMaterializable(const MachineInstr &MI); bool isTriviallyReMaterializable(const MachineInstr &MI);
// TODO: Should also attempt to reduce RP of SGPRs and AGPRs /// Rematerializes all instructions in PreRARematStage::Rematerializations
// Attempt to reduce RP of VGPR by sinking trivially rematerializable /// and stores the achieved occupancy after remat in
// instructions. Returns true if we were able to sink instruction(s). /// PreRARematStage::AchievedOcc.
bool sinkTriviallyRematInsts(const GCNSubtarget &ST, void rematerialize();
const TargetInstrInfo *TII);
/// If remat alone did not increase occupancy to the target one, rollbacks all
/// rematerializations and resets live-ins/RP in all regions impacted by the
/// stage to their pre-stage values.
void finalizeGCNSchedStage() override;
/// \p Returns true if all the uses in \p InstToRemat defined at \p /// \p Returns true if all the uses in \p InstToRemat defined at \p
/// OriginalIdx are live at \p RematIdx. This only checks liveness of virtual /// OriginalIdx are live at \p RematIdx. This only checks liveness of virtual

View File

@@ -466,7 +466,7 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
                         getReservedNumSGPRs(MF));
 }
-static unsigned getMaxNumPreloadedSGPRs() {
+unsigned GCNSubtarget::getMaxNumPreloadedSGPRs() const {
   using USI = GCNUserSGPRUsageInfo;
   // Max number of user SGPRs
   const unsigned MaxUserSGPRs =
@@ -497,42 +497,28 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
 }
 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
-    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
-  // Compute maximum number of VGPRs function can use using default/requested
-  // minimum number of waves per execution unit.
-  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
+    const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
+  const auto &[Min, Max] = NumVGPRBounds;
   // Check if maximum number of VGPRs was explicitly requested using
   // "amdgpu-num-vgpr" attribute.
-  unsigned Requested =
-      F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);
-  if (Requested != MaxNumVGPRs) {
-    if (hasGFX90AInsts())
+  unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
+  if (Requested != Max && hasGFX90AInsts())
     Requested *= 2;
-    // Make sure requested value is compatible with values implied by
-    // default/requested minimum/maximum number of waves per execution unit.
-    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
-      Requested = 0;
-    if (WavesPerEU.second && Requested &&
-        Requested < getMinNumVGPRs(WavesPerEU.second))
-      Requested = 0;
-    if (Requested)
-      MaxNumVGPRs = Requested;
-  }
-  return MaxNumVGPRs;
+  // Make sure requested value is inside the range of possible VGPR usage.
+  return std::clamp(Requested, Min, Max);
 }
 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
-  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
+  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
+  return getBaseMaxNumVGPRs(
+      F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)});
 }
 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
-  const Function &F = MF.getFunction();
-  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
+  return getMaxNumVGPRs(MF.getFunction());
 }
 void GCNSubtarget::adjustSchedDependency(

View File

@@ -1505,6 +1505,9 @@ public:
   /// \returns Reserved number of SGPRs for given function \p F.
   unsigned getReservedNumSGPRs(const Function &F) const;
+  /// \returns Maximum number of preloaded SGPRs for the subtarget.
+  unsigned getMaxNumPreloadedSGPRs() const;
+
   /// \returns max num SGPRs. This is the common utility
   /// function called by MachineFunction and Function
   /// variants of getMaxNumSGPRs.
@@ -1573,8 +1576,10 @@ public:
   /// \returns max num VGPRs. This is the common utility function
   /// called by MachineFunction and Function variants of getMaxNumVGPRs.
-  unsigned getBaseMaxNumVGPRs(const Function &F,
-                              std::pair<unsigned, unsigned> WavesPerEU) const;
+  unsigned
+  getBaseMaxNumVGPRs(const Function &F,
+                     std::pair<unsigned, unsigned> NumVGPRBounds) const;
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p F, or number of VGPRs explicitly
   /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
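
The net effect of the getBaseMaxNumVGPRs rewrite is that an out-of-range
"amdgpu-num-vgpr" request is now clamped into the VGPR bounds implied by
waves-per-EU instead of being discarded; a sketch with illustrative numbers
(not taken from any real subtarget):

```
// Illustrative bounds only. Before the patch a request outside [Min, Max] was
// ignored and the default maximum was used; after the patch it is clamped.
unsigned Min = 24, Max = 128; // bounds derived from waves-per-EU
unsigned Requested = 8;       // from the "amdgpu-num-vgpr" attribute
unsigned Result = std::clamp(Requested, Min, Max); // 24 now, was 128 before
```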

View File

@@ -1190,6 +1190,8 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
   return IsWave32 ? 8 : 4;
 }
+unsigned getArchVGPRAllocGranule() { return 4; }
+
 unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
   if (STI->getFeatureBits().test(FeatureGFX90AInsts))
     return 512;

View File

@@ -309,6 +309,10 @@ unsigned getVGPREncodingGranule(
     const MCSubtargetInfo *STI,
     std::optional<bool> EnableWavefrontSize32 = std::nullopt);
+/// For subtargets with a unified VGPR file and mixed ArchVGPR/AGPR usage,
+/// returns the allocation granule for ArchVGPRs.
+unsigned getArchVGPRAllocGranule();
+
 /// \returns Total number of VGPRs for given subtarget \p STI.
 unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);

File diff suppressed because it is too large.

View File

@@ -17,7 +17,7 @@ machineFunctionInfo:
   isEntryFunction: true
 body: |
   ; DEBUG: Machine code for function sink_and_inc_idx_when_skipping_small_region_1: IsSSA, NoPHIs, TracksLiveness
-  ; DEBUG: Retrying function scheduling with improved occupancy of 10 from rematerializing
+  ; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 10 from rematerializing (original was 9, target was 10)
   ; DEBUG-NEXT: ********** MI Scheduling **********
   ; DEBUG-NEXT: sink_and_inc_idx_when_skipping_small_region_1:%bb.2
   ; DEBUG-NEXT: From: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
@@ -89,7 +89,7 @@ machineFunctionInfo:
   isEntryFunction: true
 body: |
   ; DEBUG: Machine code for function sink_and_inc_idx_when_skipping_small_regions_2: IsSSA, NoPHIs, TracksLiveness
-  ; DEBUG: Retrying function scheduling with improved occupancy of 10 from rematerializing
+  ; DEBUG: [PreRARemat] Retrying function scheduling with new min. occupancy of 10 from rematerializing (original was 9, target was 10)
   ; DEBUG-NEXT: ********** MI Scheduling **********
   ; DEBUG-NEXT: sink_and_inc_idx_when_skipping_small_regions_2:%bb.2
   ; DEBUG-NEXT: From: %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0

View File

@@ -506,8 +506,8 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ; GFX908-NEXT:    v_accvgpr_write_b32 a3, 0
 ; GFX908-NEXT:    v_accvgpr_write_b32 a2, 0
 ; GFX908-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX908-NEXT:    s_mov_b32 s0, 16
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT:    s_mov_b32 s0, 16
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX908-NEXT:  .LBB2_1: ; %for.cond.preheader
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -566,7 +566,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX90A-LABEL: test_mfma_loop_non_splat:
 ; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 1.0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
@@ -600,6 +599,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX90A-NEXT:  .LBB2_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -626,7 +626,6 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX942-LABEL: test_mfma_loop_non_splat:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a1, 1.0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
@@ -660,6 +659,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-NEXT:  .LBB2_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1