[AMDGPU] Replace dynamic VGPR feature with attribute (#133444)

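Use a function attribute (amdgpu-dynamic-vgpr-block-size) instead of the dynamic-vgpr subtarget feature, as requested in #130030. The subtarget feature is kept temporarily until all users are migrated to the attribute.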
2025-06-24 11:09:36 +02:00 · 2025-06-24 11:09:36 +02:00 · a201f8872a
commit a201f8872a
parent 6cfa03f1f1
34 changed files with 1205 additions and 199 deletions
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@ -768,11 +768,6 @@ For example:
                                                  performant than code generated for XNACK replay
                                                  disabled.
     dynamic-vgpr    TODO                         Represents the "Dynamic VGPR" hardware mode, introduced in GFX12.
                                                  Waves launched in this mode may allocate or deallocate the VGPRs
                                                  using dedicated instructions, but may not send the DEALLOC_VGPRS
                                                  message.
     =============== ============================ ==================================================
 .. _amdgpu-target-id:
@ -1764,6 +1759,15 @@ The AMDGPU backend supports the following LLVM IR attributes.
     "amdgpu-promote-alloca-to-vector-vgpr-ratio"     Ratio of VGPRs to budget for promoting alloca to vectors.
     "amdgpu-dynamic-vgpr-block-size"                 Represents the size of a VGPR block in the "Dynamic VGPR" hardware mode,
                                                      introduced in GFX12.
                                                      A value of 0 (default) means that dynamic VGPRs are not enabled.
                                                      Valid values for GFX12+ are 16 and 32.
                                                      Waves launched in this mode may allocate or deallocate the VGPRs
                                                      using dedicated instructions, but may not send the DEALLOC_VGPRS
                                                      message. If a shader has this attribute, then all its callees must
                                                      match its value.
     ================================================ ==========================================================
 Calling Conventions
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@ -1281,12 +1281,14 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
   "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
 >;
 // FIXME: Remove after all users are migrated to attribute.
 def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
  "DynamicVGPR",
  "true",
  "Enable dynamic VGPR mode"
 >;
 // FIXME: Remove after all users are migrated to attribute.
 def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
  "DynamicVGPRBlockSize32",
  "true",
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@ -452,15 +452,17 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
      unsigned MaxWaves = MFI.getMaxWavesPerEU();
      uint64_t TotalNumVgpr =
          getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
-      uint64_t NumVGPRsForWavesPerEU = std::max(
+      uint64_t NumVGPRsForWavesPerEU =
-          {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
+          std::max({TotalNumVgpr, (uint64_t)1,
                    (uint64_t)STM.getMinNumVGPRs(
                        MaxWaves, MFI.getDynamicVGPRBlockSize())});
      uint64_t NumSGPRsForWavesPerEU = std::max(
          {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
      const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
          STM.getOccupancyWithWorkGroupSizes(*MF).second,
          MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
-          MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
+          MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
-          OutContext);
+          MFI.getDynamicVGPRBlockSize(), STM, OutContext);
      uint64_t Occupancy;
      const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
@ -1082,7 +1084,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                              Ctx);
  ProgInfo.NumVGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
-                               CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
+                               CreateExpr(STM.getMinNumVGPRs(
                                   MaxWaves, MFI->getDynamicVGPRBlockSize()))},
                              Ctx);
  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
@ -1256,7 +1259,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
  ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
      STM.computeOccupancy(F, ProgInfo.LDSSize).second,
-      ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
+      ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
      MFI->getDynamicVGPRBlockSize(), STM, Ctx);
  const auto [MinWEU, MaxWEU] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
@ -1405,7 +1409,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
 // Helper function to add common PAL Metadata 3.0+
 static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
                                  const SIProgramInfo &CurrentProgramInfo,
-                                  CallingConv::ID CC, const GCNSubtarget &ST) {
+                                  CallingConv::ID CC, const GCNSubtarget &ST,
                                  unsigned DynamicVGPRBlockSize) {
  if (ST.hasIEEEMode())
    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
@ -1417,7 +1422,7 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
                   (bool)CurrentProgramInfo.TrapHandlerEnable);
    MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
-    if (ST.isDynamicVGPREnabled())
+    if (DynamicVGPRBlockSize != 0)
      MD->setComputeRegisters(".dynamic_vgpr_en", true);
  }
@ -1444,7 +1449,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
  // For targets that support dynamic VGPRs, set the number of saved dynamic
  // VGPRs (if any) in the PAL metadata.
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
-  if (STM.isDynamicVGPREnabled() &&
+  if (MFI->isDynamicVGPREnabled() &&
      MFI->getScratchReservedForDynamicVGPRs() > 0)
    MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
                   MFI->getScratchReservedForDynamicVGPRs() / 4);
@ -1470,7 +1475,8 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
    MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
    MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
                   CurrentProgramInfo.ScratchEnable);
-    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
+    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
                          MFI->getDynamicVGPRBlockSize());
  }
  // ScratchSize is in bytes, 16 aligned.
@ -1541,7 +1547,9 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
    MD->setRsrc2(CallingConv::AMDGPU_CS,
                 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
  } else {
-    EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
+    EmitPALMetadataCommon(
        MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
        MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
  }
  // Set optional info
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@ -173,8 +173,16 @@ static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
    return 128;
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(
-      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first);
+      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
      DynamicVGPRBlockSize);
  // A non-entry function has only 32 caller preserved registers.
  // Do not promote alloca which will force spilling unless we know the function
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@ -448,7 +448,10 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
 unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
  // TODO: assert Regions are sorted descending by pressure
  const auto &ST = MF.getSubtarget<GCNSubtarget>();
-  const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+  const unsigned DynamicVGPRBlockSize =
      MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
  const auto Occ =
      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
  LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
                    << ", current = " << Occ << '\n');
@ -457,7 +460,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
    // Always build the DAG to add mutations
    BuildDAG DAG(*R, *this);
-    if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
+    if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc)
      continue;
    LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
@ -468,7 +471,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
    LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
               printSchedRP(dbgs(), R->MaxPressure, MaxRP));
-    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
+    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize));
    if (NewOcc <= Occ)
      break;
@ -489,9 +492,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
  const auto &ST = MF.getSubtarget<GCNSubtarget>();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto TgtOcc = MFI->getMinAllowedOccupancy();
  unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
  sortRegionsByPressure(TgtOcc);
-  auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+  auto Occ =
      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
  bool IsReentry = false;
  if (TryMaximizeOccupancy && Occ < TgtOcc) {
@ -522,19 +527,21 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
      const auto RP = getRegionPressure(*R);
      LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
-      if (RP.getOccupancy(ST) < TgtOcc) {
+      if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
        LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
-        if (R->BestSchedule.get() &&
+        if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
-            R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+                                         ST, DynamicVGPRBlockSize) >= TgtOcc) {
          LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
          scheduleBest(*R);
        } else {
          LLVM_DEBUG(dbgs() << ", restoring\n");
          Ovr.restoreOrder();
-          assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
+          assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >=
                 TgtOcc);
        }
      }
-      FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
+      FinalOccupancy =
          std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
    }
  }
  MFI->limitOccupancy(FinalOccupancy);
@ -580,9 +587,11 @@ void GCNIterativeScheduler::scheduleILP(
  const auto &ST = MF.getSubtarget<GCNSubtarget>();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto TgtOcc = MFI->getMinAllowedOccupancy();
  unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
  sortRegionsByPressure(TgtOcc);
-  auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+  auto Occ =
      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
  bool IsReentry = false;
  if (TryMaximizeOccupancy && Occ < TgtOcc) {
@ -603,17 +612,18 @@ void GCNIterativeScheduler::scheduleILP(
    const auto RP = getSchedulePressure(*R, ILPSchedule);
    LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
-    if (RP.getOccupancy(ST) < TgtOcc) {
+    if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
      LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
-      if (R->BestSchedule.get() &&
+      if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
-        R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+                                       ST, DynamicVGPRBlockSize) >= TgtOcc) {
        LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
        scheduleBest(*R);
      }
    } else {
      scheduleRegion(*R, ILPSchedule, RP);
      LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
-      FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
+      FinalOccupancy =
          std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
    }
  }
  MFI->limitOccupancy(FinalOccupancy);
--- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@ -251,7 +251,9 @@ bool GCNNSAReassignImpl::run(MachineFunction &MF) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
-  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
+  MaxNumVGPRs = std::min(
      ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->getDynamicVGPRBlockSize()),
      MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();
  using Candidate = std::pair<const MachineInstr*, bool>;
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@ -13,6 +13,7 @@
 #include "GCNRegPressure.h"
 #include "AMDGPU.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 using namespace llvm;
@ -94,17 +95,20 @@ void GCNRegPressure::inc(unsigned Reg,
 bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
                          unsigned MaxOccupancy) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  unsigned DynamicVGPRBlockSize =
      MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
  const auto SGPROcc = std::min(MaxOccupancy,
                                ST.getOccupancyWithNumSGPRs(getSGPRNum()));
-  const auto VGPROcc =
+  const auto VGPROcc = std::min(
-    std::min(MaxOccupancy,
+      MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
-             ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
+                                                DynamicVGPRBlockSize));
  const auto OtherSGPROcc = std::min(MaxOccupancy,
                                ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
  const auto OtherVGPROcc =
      std::min(MaxOccupancy,
-             ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts())));
+               ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()),
                                           DynamicVGPRBlockSize));
  const auto Occ = std::min(SGPROcc, VGPROcc);
  const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@ -226,13 +230,15 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
                          O.getVGPRNum(ST.hasGFX90AInsts()));
 }
-Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
+Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
-  return Printable([&RP, ST](raw_ostream &OS) {
+                      unsigned DynamicVGPRBlockSize) {
  return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) {
    OS << "VGPRs: " << RP.getArchVGPRNum() << ' '
       << "AGPRs: " << RP.getAGPRNum();
    if (ST)
      OS << "(O"
-         << ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()))
+         << ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()),
                                         DynamicVGPRBlockSize)
         << ')';
    OS << ", SGPRs: " << RP.getSGPRNum();
    if (ST)
@ -240,7 +246,7 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
    OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight()
       << ", LSGPR WT: " << RP.getSGPRTuplesWeight();
    if (ST)
-      OS << " -> Occ: " << RP.getOccupancy(*ST);
+      OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize);
    OS << '\n';
  });
 }
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@ -69,9 +69,11 @@ struct GCNRegPressure {
  }
  unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
-  unsigned getOccupancy(const GCNSubtarget &ST) const {
+  unsigned getOccupancy(const GCNSubtarget &ST,
                        unsigned DynamicVGPRBlockSize) const {
    return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
-             ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
+                    ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
                                                DynamicVGPRBlockSize));
  }
  void inc(unsigned Reg,
@ -79,8 +81,10 @@ struct GCNRegPressure {
           LaneBitmask NewMask,
           const MachineRegisterInfo &MRI);
-  bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const {
+  bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O,
-    return getOccupancy(ST) > O.getOccupancy(ST);
+                       unsigned DynamicVGPRBlockSize) const {
    return getOccupancy(ST, DynamicVGPRBlockSize) >
           O.getOccupancy(ST, DynamicVGPRBlockSize);
  }
  /// Compares \p this GCNRegpressure to \p O, returning true if \p this is
@ -133,7 +137,8 @@ private:
  friend GCNRegPressure max(const GCNRegPressure &P1,
                            const GCNRegPressure &P2);
-  friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST);
+  friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST,
                         unsigned DynamicVGPRBlockSize);
 };
 inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
@ -402,7 +407,8 @@ GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
 bool isEqual(const GCNRPTracker::LiveRegSet &S1,
             const GCNRPTracker::LiveRegSet &S2);
-Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr);
+Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr,
                unsigned DynamicVGPRBlockSize = 0);
 Printable print(const GCNRPTracker::LiveRegSet &LiveRegs,
                const MachineRegisterInfo &MRI);
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@ -99,17 +99,20 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
  if (!KnownExcessRP) {
-    VGPRCriticalLimit =
+    VGPRCriticalLimit = std::min(
-        std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
+        ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
        VGPRExcessLimit);
  } else {
    // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
    // that it returns a reasonably small number for targets with lots of VGPRs,
    // such as GFX10 and GFX11.
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");
-
+    unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
-    unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
+    unsigned Granule =
-    unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
+        AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
    unsigned Addressable =
        AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
    VGPRBudget = std::max(VGPRBudget, Granule);
    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
@ -1136,7 +1139,8 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
  if (DAG.MinOccupancy > InitialOccupancy) {
    for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
      DAG.RegionsWithMinOcc[IDX] =
-          DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
+          DAG.Pressure[IDX].getOccupancy(
              DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy;
    LLVM_DEBUG(dbgs() << StageID
                      << " stage successfully increased occupancy to "
@ -1273,11 +1277,14 @@ void GCNSchedStage::checkScheduling() {
  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
  unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+        PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
        DAG.MinOccupancy;
    // Early out if we have achieved the occupancy target.
    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
@ -1286,10 +1293,10 @@ void GCNSchedStage::checkScheduling() {
  unsigned TargetOccupancy = std::min(
      S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
-  unsigned WavesAfter =
+  unsigned WavesAfter = std::min(
-      std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
+      TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
-  unsigned WavesBefore =
+  unsigned WavesBefore = std::min(
-      std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
+      TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");
@ -1338,7 +1345,8 @@ void GCNSchedStage::checkScheduling() {
  } else {
    DAG.Pressure[RegionIdx] = PressureAfter;
    DAG.RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+        PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
        DAG.MinOccupancy;
  }
 }
@ -1461,11 +1469,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
    return true;
  // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
-  if (ST.isDynamicVGPREnabled()) {
+  if (DAG.MFI.isDynamicVGPREnabled()) {
    unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
-        &ST, PressureBefore.getVGPRNum(false));
+        &ST, DAG.MFI.getDynamicVGPRBlockSize(),
        PressureBefore.getVGPRNum(false));
    unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
-        &ST, PressureAfter.getVGPRNum(false));
+        &ST, DAG.MFI.getDynamicVGPRBlockSize(),
        PressureAfter.getVGPRNum(false));
    if (BlocksAfter > BlocksBefore)
      return true;
  }
@ -1489,7 +1499,8 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
  // If RP is not reduced in the unclustered reschedule stage, revert to the
  // old schedule.
-  if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
+  if ((WavesAfter <=
           PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
       mayCauseSpilling(WavesAfter)) ||
      GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
@ -1511,8 +1522,9 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
  ScheduleMetrics MAfter = getScheduleMetrics(DAG);
  unsigned OldMetric = MBefore.getMetric();
  unsigned NewMetric = MAfter.getMetric();
-  unsigned WavesBefore =
+  unsigned WavesBefore = std::min(
-      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
+      S.getTargetOccupancy(),
      PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
  unsigned Profit =
      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
       ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
@ -1566,7 +1578,8 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
 void GCNSchedStage::revertScheduling() {
  DAG.RegionsWithMinOcc[RegionIdx] =
-      PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
+      PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) ==
      DAG.MinOccupancy;
  LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
  DAG.RegionEnd = DAG.RegionBegin;
  int SkippedDebugInstr = 0;
@ -1844,13 +1857,16 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
  // occupancy, or regions with VGPR spilling) to a model of their excess RP.
  DenseMap<unsigned, ExcessRP> OptRegions;
  const Function &F = MF.getFunction();
  unsigned DynamicVGPRBlockSize =
      MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
  std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
  const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
  const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
  const unsigned MaxSGPRsIncOcc =
      ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
-  const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
+  const unsigned MaxVGPRsIncOcc =
      ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
  IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
  auto ClearOptRegionsIf = [&](bool Cond) -> bool {
@ -2163,7 +2179,9 @@ void PreRARematStage::rematerialize() {
      }
    }
    DAG.Pressure[I] = RP;
-    AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(ST));
+    AchievedOcc = std::min(
        AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
                                             ->getDynamicVGPRBlockSize()));
  }
  REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
 }
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@ -366,8 +366,11 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
                                                   getGeneration());
 }
-unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
+unsigned
-  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
+GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
                                       unsigned DynamicVGPRBlockSize) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
                                                       DynamicVGPRBlockSize);
 }
 unsigned
@ -403,9 +406,15 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
 std::pair<unsigned, unsigned>
 GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                               unsigned NumSGPRs, unsigned NumVGPRs) const {
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
  auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
  unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
-  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);
+  unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
  // Maximum occupancy may be further limited by high SGPR/VGPR usage.
  MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
@ -512,9 +521,16 @@ unsigned GCNSubtarget::getBaseMaxNumVGPRs(
 }
 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
    DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
  std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
  return getBaseMaxNumVGPRs(
-      F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)});
+      F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
          getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
 }
 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@ -1391,7 +1391,8 @@ public:
  /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
  /// VGPRs
-  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
+  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
                                    unsigned DynamicVGPRBlockSize) const;
  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
  /// be achieved when the only function running on a CU is \p F, each workgroup
@ -1549,8 +1550,8 @@ public:
  unsigned getMaxNumSGPRs(const Function &F) const;
  /// \returns VGPR allocation granularity supported by the subtarget.
-  unsigned getVGPRAllocGranule() const {
+  unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
-    return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
+    return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
  }
  /// \returns VGPR encoding granularity supported by the subtarget.
@ -1570,20 +1571,24 @@ public:
  }
  /// \returns Addressable number of VGPRs supported by the subtarget.
-  unsigned getAddressableNumVGPRs() const {
+  unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
-    return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
+    return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
  }
  /// \returns the minimum number of VGPRs that will prevent achieving more than
  /// the specified number of waves \p WavesPerEU.
-  unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
+  unsigned getMinNumVGPRs(unsigned WavesPerEU,
-    return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
+                          unsigned DynamicVGPRBlockSize) const {
    return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
                                           DynamicVGPRBlockSize);
  }
  /// \returns the maximum number of VGPRs that can be used and still achieve
  /// at least the specified number of waves \p WavesPerEU.
-  unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
+  unsigned getMaxNumVGPRs(unsigned WavesPerEU,
-    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
+                          unsigned DynamicVGPRBlockSize) const {
    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
                                           DynamicVGPRBlockSize);
  }
  /// \returns max num VGPRs. This is the common utility function
@ -1686,6 +1691,9 @@ public:
  }
  bool isDynamicVGPREnabled() const { return DynamicVGPR; }
  unsigned getDynamicVGPRBlockSize() const {
    return DynamicVGPRBlockSize32 ? 32 : 16;
  }
  bool requiresDisjointEarlyClobberAndUndef() const override {
    // AMDGPU doesn't care if early-clobber and undef operands are allocated
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@ -313,13 +313,11 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
 /// Remove dependency on GCNSubtarget and depend only on the necessary values
 /// for said occupancy computation. Should match computeOccupancy implementation
 /// without passing \p STM on.
-const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(unsigned InitOcc,
+const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(
-                                                  const MCExpr *NumSGPRs,
+    unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs,
-                                                  const MCExpr *NumVGPRs,
+    unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) {
                                                  const GCNSubtarget &STM,
                                                  MCContext &Ctx) {
  unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
-  unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM);
+  unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
  unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
  unsigned Generation = STM.getGeneration();
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@ -93,11 +93,10 @@ public:
    return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
  }
-  static const AMDGPUMCExpr *createOccupancy(unsigned InitOcc,
+  static const AMDGPUMCExpr *
-                                             const MCExpr *NumSGPRs,
+  createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
-                                             const MCExpr *NumVGPRs,
+                  const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize,
-                                             const GCNSubtarget &STM,
+                  const GCNSubtarget &STM, MCContext &Ctx);
                                             MCContext &Ctx);
  ArrayRef<const MCExpr *> getArgs() const { return Args; }
  VariantKind getKind() const { return Kind; }
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@ -197,7 +197,9 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
  // pointer becomes dead and could otherwise be reused for destination.
  RPT.advanceToNext();
  GCNRegPressure MaxPressure = RPT.moveMaxPressure();
-  unsigned Occupancy = MaxPressure.getOccupancy(*ST);
+  unsigned Occupancy = MaxPressure.getOccupancy(
      *ST,
      MI.getMF()->getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
  // Don't push over half the register budget. We don't want to introduce
  // spilling just to form a soft clause.
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@ -714,9 +714,10 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
    assert(hasFP(MF));
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
-    unsigned VGPRSize =
+    unsigned VGPRSize = llvm::alignTo(
-        llvm::alignTo((ST.getAddressableNumVGPRs() -
+        (ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()) -
-                       AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) *
+         AMDGPU::IsaInfo::getVGPRAllocGranule(&ST,
                                              MFI->getDynamicVGPRBlockSize())) *
            4,
        FrameInfo.getMaxAlign());
    MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
@ -2087,7 +2088,7 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
 // Scratch may be reserved for CWSR only when all of the following hold:
 // dynamic VGPRs are enabled for the function, its calling convention is an
 // entry function calling convention, and that calling convention is compute.
 bool SIFrameLowering::mayReserveScratchForCWSR(
    const MachineFunction &MF) const {
-  return MF.getSubtarget<GCNSubtarget>().isDynamicVGPREnabled() &&
+  return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
         AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) &&
         AMDGPU::isCompute(MF.getFunction().getCallingConv());
 }
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@ -1760,7 +1760,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
  else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
           MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
    if (!WCG->isOptNone() &&
-        (ST->isDynamicVGPREnabled() ||
+        (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
          ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
          !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
@ -2652,7 +2652,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
  Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
  Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
-  [[maybe_unused]] unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
+  [[maybe_unused]] unsigned NumVGPRsMax =
      ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
  [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
@ -2821,7 +2822,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
  // (i.e. whether we're in dynamic VGPR mode or not).
  // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
  // waveslot limited kernel runs slower with the deallocation.
-  if (ST->isDynamicVGPREnabled()) {
+  if (MFI->isDynamicVGPREnabled()) {
    for (MachineInstr *MI : ReleaseVGPRInsts) {
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII->get(AMDGPU::S_ALLOC_VGPR))
@ -2832,7 +2833,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
    if (!ReleaseVGPRInsts.empty() &&
        (MF.getFrameInfo().hasCalls() ||
         ST->getOccupancyWithNumVGPRs(
-             TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
+             TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
             /*DynamicVGPRBlockSize=*/false) <
             AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
      for (MachineInstr *MI : ReleaseVGPRInsts) {
        if (ST->requiresNopBeforeDeallocVGPRs()) {
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@ -48,6 +48,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
  MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
  assert(MaxNumWorkGroups.size() == 3);
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is completely removed.
  DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
  Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
  CallingConv::ID CC = F.getCallingConv();
@ -716,6 +722,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
      PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
      MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
      Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
      DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
      ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
  for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
    SpillPhysVGPRS.push_back(regToString(Reg, TRI));
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@ -299,6 +299,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
  bool HasInitWholeWave = false;
  unsigned DynamicVGPRBlockSize = 0;
  unsigned ScratchReservedForDynamicVGPRs = 0;
  SIMachineFunctionInfo() = default;
@ -352,6 +353,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
    YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
                       StringValue());
    YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
    YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
    YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
                       MFI.ScratchReservedForDynamicVGPRs, 0);
  }
@ -469,6 +471,8 @@ private:
  unsigned NumSpilledSGPRs = 0;
  unsigned NumSpilledVGPRs = 0;
  unsigned DynamicVGPRBlockSize = 0;
  // The size in bytes of the scratch space reserved for the CWSR trap handler
  // to spill some of the dynamic VGPRs.
  unsigned ScratchReservedForDynamicVGPRs = 0;
@ -820,6 +824,9 @@ public:
    BytesInStackArgArea = Bytes;
  }
  // Dynamic VGPRs count as enabled for this function whenever a non-zero
  // block size has been recorded.
  bool isDynamicVGPREnabled() const {
    return DynamicVGPRBlockSize > 0;
  }
  // Returns the recorded dynamic VGPR block size; 0 means dynamic VGPRs are
  // not enabled (see isDynamicVGPREnabled).
  unsigned getDynamicVGPRBlockSize() const {
    return DynamicVGPRBlockSize;
  }
  // This is only used if we need to save any dynamic VGPRs in scratch.
  unsigned getScratchReservedForDynamicVGPRs() const {
    return ScratchReservedForDynamicVGPRs;
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@ -3748,7 +3748,11 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
  default:
    return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
-    return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF));
+    return std::min(
        ST.getMaxNumVGPRs(
            MinOcc,
            MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize()),
        ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::SGPR_LO16RegClassID:
    return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@ -1158,10 +1158,16 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
 }
 unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
                             unsigned DynamicVGPRBlockSize,
                             std::optional<bool> EnableWavefrontSize32) {
  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
    return 8;
  if (DynamicVGPRBlockSize != 0)
    return DynamicVGPRBlockSize;
  // Temporarily check the subtarget feature, until we fully switch to using
  // attributes.
  if (STI->getFeatureBits().test(FeatureDynamicVGPR))
    return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
@ -1205,20 +1211,26 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
 /// \returns the addressable number of Arch VGPRs. The value is a fixed 256
 /// and does not depend on \p STI.
 unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) {
  constexpr unsigned NumArchVGPRs = 256;
  return NumArchVGPRs;
 }
-unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
+unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
                                 unsigned DynamicVGPRBlockSize) {
  // Subtargets with FeatureGFX90AInsts report 512 addressable VGPRs; this
  // check runs before any dynamic VGPR handling.
  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
    return 512;
-  if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+
  // Temporarily check the subtarget feature, until we fully switch to using
  // attributes.
  if (DynamicVGPRBlockSize != 0 ||
      STI->getFeatureBits().test(FeatureDynamicVGPR))
    // On GFX12 we can allocate at most 8 blocks of VGPRs.
-    return 8 * getVGPRAllocGranule(STI);
+    return 8 * getVGPRAllocGranule(STI, DynamicVGPRBlockSize);
  // Otherwise fall back to the addressable Arch VGPR count.
  return getAddressableNumArchVGPRs(STI);
 }
 // Subtarget-based overload: derives the allocation granule, the maximum
 // waves per EU and the total VGPR count from \p STI and forwards them to the
 // overload that takes those values explicitly.
 unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
-                                      unsigned NumVGPRs) {
+                                      unsigned NumVGPRs,
-  return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI),
+                                      unsigned DynamicVGPRBlockSize) {
-                                      getMaxWavesPerEU(STI),
+  return getNumWavesPerEUWithNumVGPRs(
-                                      getTotalNumVGPRs(STI));
+      NumVGPRs, getVGPRAllocGranule(STI, DynamicVGPRBlockSize),
      getMaxWavesPerEU(STI), getTotalNumVGPRs(STI));
 }
 unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
@ -1257,7 +1269,8 @@ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
  return 5;
 }
-unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
+unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
                        unsigned DynamicVGPRBlockSize) {
  assert(WavesPerEU != 0);
  unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
@ -1265,28 +1278,33 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
    return 0;
  unsigned TotNumVGPRs = getTotalNumVGPRs(STI);
-  unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI);
+  unsigned AddrsableNumVGPRs =
-  unsigned Granule = getVGPRAllocGranule(STI);
+      getAddressableNumVGPRs(STI, DynamicVGPRBlockSize);
  unsigned Granule = getVGPRAllocGranule(STI, DynamicVGPRBlockSize);
  unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule);
  if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule))
    return 0;
-  unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs);
+  unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs,
                                                        DynamicVGPRBlockSize);
  if (WavesPerEU < MinWavesPerEU)
-    return getMinNumVGPRs(STI, MinWavesPerEU);
+    return getMinNumVGPRs(STI, MinWavesPerEU, DynamicVGPRBlockSize);
  unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule);
  unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
  return std::min(MinNumVGPRs, AddrsableNumVGPRs);
 }
-unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
                         unsigned DynamicVGPRBlockSize) {
  assert(WavesPerEU != 0);
  // Per-wave share of the total VGPRs, rounded down to the allocation granule.
  unsigned MaxNumVGPRs =
-      alignDown(getTotalNumVGPRs(STI) / WavesPerEU, getVGPRAllocGranule(STI));
+      alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
-  unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI);
+                getVGPRAllocGranule(STI, DynamicVGPRBlockSize));
  unsigned AddressableNumVGPRs =
      getAddressableNumVGPRs(STI, DynamicVGPRBlockSize);
  // Never report more than the addressable VGPR count.
  return std::min(MaxNumVGPRs, AddressableNumVGPRs);
 }
@ -1299,9 +1317,11 @@ unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
 // Converts \p NumVGPRs into a block count via getGranulatedNumRegisterBlocks,
 // using the VGPR allocation granule computed from \p STI,
 // \p DynamicVGPRBlockSize and \p EnableWavefrontSize32.
 unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI,
                                   unsigned NumVGPRs,
                                   unsigned DynamicVGPRBlockSize,
                                   std::optional<bool> EnableWavefrontSize32) {
  return getGranulatedNumRegisterBlocks(
-      NumVGPRs, getVGPRAllocGranule(STI, EnableWavefrontSize32));
+      NumVGPRs,
      getVGPRAllocGranule(STI, DynamicVGPRBlockSize, EnableWavefrontSize32));
 }
 } // end namespace IsaInfo
@ -2124,6 +2144,16 @@ bool getHasDepthExport(const Function &F) {
  return F.getFnAttributeAsParsedInteger("amdgpu-depth-export", 0) != 0;
 }
 // Reads the "amdgpu-dynamic-vgpr-block-size" attribute of \p F. Only the
 // values 16 and 32 are accepted; a missing attribute or any other value
 // yields 0.
 unsigned getDynamicVGPRBlockSize(const Function &F) {
  const unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-dynamic-vgpr-block-size", 0);
  switch (Requested) {
  case 16:
  case 32:
    return Requested;
  default:
    return 0;
  }
 }
 // Returns true when \p STI has AMDGPU::FeatureXNACK.
 bool hasXNACK(const MCSubtargetInfo &STI) {
  const bool HasFeature = STI.hasFeature(AMDGPU::FeatureXNACK);
  return HasFeature;
 }
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@ -298,7 +298,7 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
 /// For subtargets which support it, \p EnableWavefrontSize32 should match
 /// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
 unsigned
-getVGPRAllocGranule(const MCSubtargetInfo *STI,
+getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize,
                    std::optional<bool> EnableWavefrontSize32 = std::nullopt);
 /// \returns VGPR encoding granularity for given subtarget \p STI.
@ -321,20 +321,24 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
 unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI);
 /// \returns Addressable number of VGPRs for given subtarget \p STI.
 /// \p DynamicVGPRBlockSize is the VGPR block size used in dynamic VGPR mode;
 /// 0 means dynamic VGPRs are not enabled.
-unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI);
+unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
                                 unsigned DynamicVGPRBlockSize);
 /// \returns Minimum number of VGPRs that meets given number of waves per
 /// execution unit requirement for given subtarget \p STI.
 /// \p DynamicVGPRBlockSize is the VGPR block size used in dynamic VGPR mode;
 /// 0 means dynamic VGPRs are not enabled.
-unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
+unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
                         unsigned DynamicVGPRBlockSize);
 /// \returns Maximum number of VGPRs that meets given number of waves per
 /// execution unit requirement for given subtarget \p STI.
 /// \p DynamicVGPRBlockSize is the VGPR block size used in dynamic VGPR mode;
 /// 0 means dynamic VGPRs are not enabled.
-unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
                         unsigned DynamicVGPRBlockSize);
 /// \returns Number of waves reachable for a given \p NumVGPRs usage for given
 /// subtarget \p STI.
 /// \p DynamicVGPRBlockSize is the VGPR block size used in dynamic VGPR mode;
 /// 0 means dynamic VGPRs are not enabled.
 unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
-                                      unsigned NumVGPRs);
+                                      unsigned NumVGPRs,
                                       unsigned DynamicVGPRBlockSize);
 /// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
 /// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
@ -361,6 +365,7 @@ unsigned getEncodedNumVGPRBlocks(
 /// subtarget \p STI when \p NumVGPRs are used.
 unsigned getAllocatedNumVGPRBlocks(
    const MCSubtargetInfo *STI, unsigned NumVGPRs,
    unsigned DynamicVGPRBlockSize,
    std::optional<bool> EnableWavefrontSize32 = std::nullopt);
 } // end namespace IsaInfo
@ -1305,6 +1310,12 @@ bool getHasColorExport(const Function &F);
 bool getHasDepthExport(const Function &F);
 bool hasDynamicVGPR(const Function &F);
 // Returns the value of the "amdgpu-dynamic-vgpr-block-size" attribute, or 0 if
 // the attribute is missing or its value is invalid.
 unsigned getDynamicVGPRBlockSize(const Function &F);
 LLVM_READNONE
 constexpr bool isShader(CallingConv::ID CC) {
  switch (CC) {
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=CHECK,CHECK-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=CHECK,CHECK-FAKE16 %s
 ; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
@ -137,7 +137,7 @@ define amdgpu_cs void @with_calls_no_inline_const() #0 {
  ret void
 }
-define amdgpu_cs void @with_spills() {
+define amdgpu_cs void @with_spills() #0 {
 ; CHECK-LABEL: with_spills:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
@ -366,6 +366,6 @@ define void @default() #0 {
 declare amdgpu_gfx void @callee(i32) #0
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
-attributes #1 = { nounwind "frame-pointer"="none" }
+attributes #1 = { nounwind "frame-pointer"="none" "amdgpu-dynamic-vgpr-block-size"="16" }
-attributes #2 = { nounwind "frame-pointer"="all" }
+attributes #2 = { nounwind "frame-pointer"="all" "amdgpu-dynamic-vgpr-block-size"="16" }
--- a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
 ; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
@ -68,5 +68,5 @@ define void @default() #0 {
 declare amdgpu_gfx void @callee(i32) #0
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size" = "16" }
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll
@ -0,0 +1,305 @@
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
 ; CHECK:           .amdgpu_pal_metadata
 ; CHECK-NEXT: ---
 ; CHECK-NEXT: amdpal.pipelines:
 ; CHECK-NEXT:  - .api:            Vulkan
 ; CHECK-NEXT:    .compute_registers:
 ; CHECK-NEXT:      .dynamic_vgpr_en:   true
 ; CHECK-NEXT:      .tg_size_en:     true
 ; CHECK-NEXT:      .tgid_x_en:      false
 ; CHECK-NEXT:      .tgid_y_en:      false
 ; CHECK-NEXT:      .tgid_z_en:      false
 ; CHECK-NEXT:      .tidig_comp_cnt: 0x1
 ; CHECK-NEXT:    .hardware_stages:
 ; CHECK-NEXT:      .cs:
 ; CHECK-NEXT:        .checksum_value: 0x9444d7d0
 ; CHECK-NEXT:        .debug_mode:     0
 ; CHECK-NEXT:        .excp_en:        0
 ; CHECK-NEXT:        .float_mode:     0xc0
 ; CHECK-NEXT:        .image_op:       false
 ; CHECK-NEXT:        .lds_size:       0x200
 ; CHECK-NEXT:        .mem_ordered:    true
 ; CHECK-NEXT:        .sgpr_limit:     0x6a
 ; CHECK-NEXT:        .threadgroup_dimensions:
 ; CHECK-NEXT:          - 0x1
 ; CHECK-NEXT:          - 0x400
 ; CHECK-NEXT:          - 0x1
 ; CHECK-NEXT:        .trap_present:   false
 ; CHECK-NEXT:        .user_data_reg_map:
 ; CHECK-NEXT:          - 0x10000000
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:        .user_sgprs:     0x3
 ; CHECK-NEXT:        .vgpr_limit:     0x100
 ; CHECK-NEXT:        .wavefront_size: 0x40
 ; CHECK-NEXT:        .wgp_mode:       true
 ; CHECK:    .registers:      {}
 ; CHECK-NEXT:    .shader_functions:
 ; CHECK-NEXT:      dynamic_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x10
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x22
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
 ; CHECK-NEXT:        .vgpr_count:     0x2
 ; CHECK-NEXT:      dynamic_stack_loop:
 ; CHECK-NEXT:        .backend_stack_size: 0x10
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x22
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
 ; CHECK-NEXT:        .vgpr_count:     0x3
 ; CHECK-NEXT:      multiple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x24
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x24
 ; CHECK-NEXT:        .vgpr_count:     0x3
 ; CHECK-NEXT:      no_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      no_stack_call:
 ; CHECK-NEXT:        .backend_stack_size: 0x10
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x22
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
 ; CHECK-NEXT:        .vgpr_count:     0x3
 ; CHECK-NEXT:      no_stack_extern_call:
 ; CHECK-NEXT:        .backend_stack_size: 0x10
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x24
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
 ; CHECK-NEXT:        .vgpr_count:     0x58
 ; CHECK-NEXT:      no_stack_extern_call_many_args:
 ; CHECK-NEXT:        .backend_stack_size: 0x90
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x24
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x90
 ; CHECK-NEXT:        .vgpr_count:     0x58
 ; CHECK-NEXT:      no_stack_indirect_call:
 ; CHECK-NEXT:        .backend_stack_size: 0x10
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x24
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
 ; CHECK-NEXT:        .vgpr_count:     0x58
 ; CHECK-NEXT:      simple_lds:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0x100
 ; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      simple_lds_recurse:
 ; CHECK-NEXT:        .backend_stack_size: 0x10
 ; CHECK-NEXT:        .lds_size:       0x100
 ; CHECK-NEXT:        .sgpr_count:     0x24
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x10
 ; CHECK-NEXT:        .vgpr_count:     0x29
 ; CHECK-NEXT:      simple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x14
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x14
 ; CHECK-NEXT:        .vgpr_count:     0x2
 ; CHECK-NEXT:      simple_stack_call:
 ; CHECK-NEXT:        .backend_stack_size: 0x20
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x22
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x20
 ; CHECK-NEXT:        .vgpr_count:     0x4
 ; CHECK-NEXT:      simple_stack_extern_call:
 ; CHECK-NEXT:        .backend_stack_size: 0x20
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x24
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x20
 ; CHECK-NEXT:        .vgpr_count:     0x58
 ; CHECK-NEXT:      simple_stack_indirect_call:
 ; CHECK-NEXT:        .backend_stack_size: 0x20
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x24
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x20
 ; CHECK-NEXT:        .vgpr_count:     0x58
 ; CHECK-NEXT:      simple_stack_recurse:
 ; CHECK-NEXT:        .backend_stack_size: 0x20
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .sgpr_count:     0x24
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x20
 ; CHECK-NEXT:        .vgpr_count:     0x2a
 ; CHECK:amdpal.version:
 ; CHECK-NEXT:  - 0x3
 ; CHECK-NEXT:  - 0
 ; CHECK-NEXT:...
 ; CHECK-NEXT:        .end_amdgpu_pal_metadata
 declare amdgpu_gfx float @extern_func(float) #0
 declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4
 ; Leaf function with no alloca and no call: returns %arg0 + 1.0.
 define amdgpu_gfx float @no_stack(float %arg0) #0 {
  %add = fadd float %arg0, 1.0
  ret float %add
 }
 ; Leaf function with a single private (addrspace 5) alloca that is written
 ; and read back through volatile accesses.
 define amdgpu_gfx float @simple_stack(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %add = fadd float %arg0, %val
  ret float %add
 }
 ; Leaf function with two separate private allocas (%stack and %stack2), each
 ; written and read back through volatile accesses.
 define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %add = fadd float %arg0, %val
  %stack2 = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack2
  %val2 = load volatile float, ptr addrspace(5) %stack2
  %add2 = fadd float %add, %val2
  ret float %add2
 }
 ; The alloca lives in bb1, which is only reached when %arg0 > 0.0, so it is
 ; not in the entry block.
 define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
 bb0:
  %cmp = fcmp ogt float %arg0, 0.0
  br i1 %cmp, label %bb1, label %bb2
 bb1:
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %add = fadd float %arg0, %val
  br label %bb2
 bb2:
  %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
  ret float %res
 }
 ; The alloca lives inside the loop block bb1, which branches back to itself
 ; while %ctr > 0.
 define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
 bb0:
  br label %bb1
 bb1:
  %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %add = fadd float %arg0, %val
  %cmp = icmp sgt i32 %ctr, 0
  %newctr = sub i32 %ctr, 1
  br i1 %cmp, label %bb1, label %bb2
 bb2:
  ret float %add
 }
 ; No alloca of its own; makes a direct call to @simple_stack, which is
 ; defined in this module.
 define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
  %res = call amdgpu_gfx float @simple_stack(float %arg0)
  ret float %res
 }
 ; One private alloca plus a direct call to @simple_stack, which is defined
 ; in this module.
 define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %res = call amdgpu_gfx float @simple_stack(float %arg0)
  %add = fadd float %res, %val
  ret float %add
 }
 ; No alloca; calls @extern_func, which is only declared (not defined) in
 ; this module.
 define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
  %res = call amdgpu_gfx float @extern_func(float %arg0)
  ret float %res
 }
 ; One private alloca plus a call to @extern_func, which is only declared
 ; (not defined) in this module.
 define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %res = call amdgpu_gfx float @extern_func(float %arg0)
  %add = fadd float %res, %val
  ret float %add
 }
 ; No alloca; forwards a <64 x float> argument to the declared-only function
 ; @extern_func_many_args.
 define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
  %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
  ret float %res
 }
 ; No alloca; loads a function pointer from @funcptr and calls through it.
 define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
  %fptr = load ptr, ptr addrspace(4) @funcptr
  call amdgpu_gfx void %fptr()
  ret float %arg0
 }
 ; One private alloca plus an indirect call through the pointer loaded from
 ; @funcptr.
 define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %fptr = load ptr, ptr addrspace(4) @funcptr
  call amdgpu_gfx void %fptr()
  %add = fadd float %arg0, %val
  ret float %add
 }
 ; One private alloca plus a direct call to itself (self-recursion).
 define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
  %stack = alloca float, i32 4, align 4, addrspace(5)
  store volatile float 2.0, ptr addrspace(5) %stack
  %val = load volatile float, ptr addrspace(5) %stack
  %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
  %add = fadd float %res, %val
  ret float %add
 }
@lds = internal addrspace(3) global [64 x float] poison
 ; Leaf function that loads one float from the addrspace(3) global @lds;
 ; %arg0 is unused.
 define amdgpu_gfx float @simple_lds(float %arg0) #0 {
  %val = load float, ptr addrspace(3) @lds
  ret float %val
 }
 ; Loads one float from the addrspace(3) global @lds and passes it to a
 ; direct call to itself (self-recursion).
 define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
  %val = load float, ptr addrspace(3) @lds
  %res = call amdgpu_gfx float @simple_lds_recurse(float %val)
  ret float %res
 }
 attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
 !amdgpu.pal.metadata.msgpack = !{!0}
 !0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
 !1 = !{i32 7}
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll
@ -0,0 +1,205 @@
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s
 ; CHECK-LABEL: {{^}}_amdgpu_cs_main:
 ; CHECK: ; TotalNumSgprs: 34
 ; CHECK: ; NumVgprs: 2
 ; CHECK:           .amdgpu_pal_metadata
 ; CHECK-NEXT: ---
 ; CHECK-NEXT: amdpal.pipelines:
 ; CHECK-NEXT:   - .api:            Vulkan
 ; CHECK-NEXT:     .compute_registers:
 ; CHECK-NEXT:       .dynamic_vgpr_en:   true
 ; CHECK-NEXT:       .tg_size_en:     true
 ; CHECK-NEXT:       .tgid_x_en:      false
 ; CHECK-NEXT:       .tgid_y_en:      false
 ; CHECK-NEXT:       .tgid_z_en:      false
 ; CHECK-NEXT:       .tidig_comp_cnt: 0x1
 ; CHECK-NEXT:     .graphics_registers:
 ; CHECK-NEXT:      .ps_extra_lds_size: 0
 ; CHECK-NEXT:      .spi_ps_input_addr:
 ; CHECK-NEXT:        .ancillary_ena:  false
 ; CHECK-NEXT:        .front_face_ena: true
 ; CHECK-NEXT:        .line_stipple_tex_ena: false
 ; CHECK-NEXT:        .linear_center_ena: true
 ; CHECK-NEXT:        .linear_centroid_ena: true
 ; CHECK-NEXT:        .linear_sample_ena: true
 ; CHECK-NEXT:        .persp_center_ena: true
 ; CHECK-NEXT:        .persp_centroid_ena: true
 ; CHECK-NEXT:        .persp_pull_model_ena: false
 ; CHECK-NEXT:        .persp_sample_ena: true
 ; CHECK-NEXT:        .pos_fixed_pt_ena: true
 ; CHECK-NEXT:        .pos_w_float_ena: false
 ; CHECK-NEXT:        .pos_x_float_ena: false
 ; CHECK-NEXT:        .pos_y_float_ena: false
 ; CHECK-NEXT:        .pos_z_float_ena: false
 ; CHECK-NEXT:        .sample_coverage_ena: false
 ; CHECK-NEXT:      .spi_ps_input_ena:
 ; CHECK-NEXT:        .ancillary_ena:  false
 ; CHECK-NEXT:        .front_face_ena: false
 ; CHECK-NEXT:        .line_stipple_tex_ena: false
 ; CHECK-NEXT:        .linear_center_ena: false
 ; CHECK-NEXT:        .linear_centroid_ena: false
 ; CHECK-NEXT:        .linear_sample_ena: false
 ; CHECK-NEXT:        .persp_center_ena: false
 ; CHECK-NEXT:        .persp_centroid_ena: false
 ; CHECK-NEXT:        .persp_pull_model_ena: false
 ; CHECK-NEXT:        .persp_sample_ena: true
 ; CHECK-NEXT:        .pos_fixed_pt_ena: false
 ; CHECK-NEXT:        .pos_w_float_ena: false
 ; CHECK-NEXT:        .pos_x_float_ena: false
 ; CHECK-NEXT:        .pos_y_float_ena: false
 ; CHECK-NEXT:        .pos_z_float_ena: false
 ; CHECK-NEXT:        .sample_coverage_ena: false
 ; CHECK-NEXT:    .hardware_stages:
 ; CHECK-NEXT:      .cs:
 ; CHECK-NEXT:        .checksum_value: 0x9444d7d0
 ; CHECK-NEXT:        .debug_mode:     false
 ; CHECK-NEXT:        .dynamic_vgpr_saved_count: 0x70
 ; CHECK-NEXT:        .entry_point:    _amdgpu_cs
 ; CHECK-NEXT:        .entry_point_symbol:    _amdgpu_cs_main
 ; CHECK-NEXT:        .excp_en:        0
 ; CHECK-NEXT:        .float_mode:     0xc0
 ; CHECK-NEXT:        .image_op:       false
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .mem_ordered:    true
 ; CHECK-NEXT:        .scratch_en:     false
 ; CHECK-NEXT:        .scratch_memory_size: 0
 ; CHECK-NEXT:        .sgpr_count:     0x22
 ; CHECK-NEXT:        .sgpr_limit:     0x6a
 ; CHECK-NEXT:        .threadgroup_dimensions:
 ; CHECK-NEXT:          - 0x1
 ; CHECK-NEXT:          - 0x400
 ; CHECK-NEXT:          - 0x1
 ; CHECK-NEXT:        .trap_present:   false
 ; CHECK-NEXT:        .user_data_reg_map:
 ; CHECK-NEXT:          - 0x10000000
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:          - 0xffffffff
 ; CHECK-NEXT:        .user_sgprs:     0x3
 ; CHECK-NEXT:        .vgpr_count:     0x2
 ; CHECK-NEXT:        .vgpr_limit:     0x100
 ; CHECK-NEXT:        .wavefront_size: 0x40
 ; CHECK-NEXT:        .wgp_mode:       false
 ; CHECK-NEXT:      .gs:
 ; CHECK-NEXT:        .debug_mode:     false
 ; CHECK-NEXT:        .entry_point:    _amdgpu_gs
 ; CHECK-NEXT:        .entry_point_symbol:    gs_shader
 ; CHECK-NEXT:        .lds_size:       0x200
 ; CHECK-NEXT:        .mem_ordered:    true
 ; CHECK-NEXT:        .scratch_en:     false
 ; CHECK-NEXT:        .scratch_memory_size: 0
 ; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:        .wgp_mode:       true
 ; CHECK-NEXT:      .hs:
 ; CHECK-NEXT:        .debug_mode:     false
 ; CHECK-NEXT:        .entry_point:    _amdgpu_hs
 ; CHECK-NEXT:        .entry_point_symbol:    hs_shader
 ; CHECK-NEXT:        .lds_size:       0x1000
 ; CHECK-NEXT:        .mem_ordered:    true
 ; CHECK-NEXT:        .scratch_en:     false
 ; CHECK-NEXT:        .scratch_memory_size: 0
 ; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:        .wgp_mode:       true
 ; CHECK-NEXT:      .ps:
 ; CHECK-NEXT:        .debug_mode:     false
 ; CHECK-NEXT:        .entry_point:    _amdgpu_ps
 ; CHECK-NEXT:        .entry_point_symbol:    ps_shader
 ; CHECK-NEXT:        .lds_size:       0
 ; CHECK-NEXT:        .mem_ordered:    true
 ; CHECK-NEXT:        .scratch_en:     false
 ; CHECK-NEXT:        .scratch_memory_size: 0
 ; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:        .wgp_mode:       true
 ; CHECK:    .registers:      {}
 ; CHECK:amdpal.version:
 ; CHECK-NEXT:  - 0x3
 ; CHECK-NEXT:  - 0
 ; CHECK-NEXT:...
 ; CHECK-NEXT:        .end_amdgpu_pal_metadata
 ; amdgpu_cs function using attribute group #0. It forms a 64-bit address
 ; from the upper 32 bits of the s.getpc result and %arg1 (zero-extended) as
 ; the lower 32 bits, loads a <4 x i32> from that address in addrspace(4), and
 ; passes the loaded vector to a raw buffer store of the constant 1.
 ; The store offset is ((%arg2 & 1023) + ((%arg2 >> 10) & 1023)) << 2.
 define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
 .entry:
  %i = call i64 @llvm.amdgcn.s.getpc()
  ; -4294967296 is 0xFFFFFFFF00000000, so only the upper half is kept.
  %i1 = and i64 %i, -4294967296
  %i2 = zext i32 %arg1 to i64
  %i3 = or i64 %i1, %i2
  %i4 = inttoptr i64 %i3 to ptr addrspace(4)
  ; Two 10-bit fields of %arg2 are extracted and added together.
  %i5 = and i32 %arg2, 1023
  %i6 = lshr i32 %arg2, 10
  %i7 = and i32 %i6, 1023
  %i8 = add nuw nsw i32 %i7, %i5
  %i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
  %.idx = shl nuw nsw i32 %i8, 2
  call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
  ret void
 }
 ; Empty amdgpu_ps function. Its attribute group #1 sets
 ; "InitialPSInputAddr"="36983".
 define dllexport amdgpu_ps void @ps_shader() #1 {
  ret void
 }
@LDS.GS = external addrspace(3) global [1 x i32], align 4
 ; amdgpu_gs function that stores i32 0 to the external addrspace(3) global
 ; @LDS.GS (the getelementptr uses index 0).
 ; NOTE(review) - this function references attribute group #2, but no
 ; definition of that group is visible in this part of the file.
 define dllexport amdgpu_gs void @gs_shader() #2 {
  %ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0
  store i32 0, ptr addrspace(3) %ptr, align 4
  ret void
 }
@LDS.HS = external addrspace(3) global [1024 x i32], align 4
 ; amdgpu_hs function that stores i32 0 to the external addrspace(3) global
 ; @LDS.HS (the getelementptr uses index 0).
 ; NOTE(review) - this function references attribute group #2, but no
 ; definition of that group is visible in this part of the file.
 define dllexport amdgpu_hs void @hs_shader() #2 {
  %ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0
  store i32 0, ptr addrspace(3) %ptr, align 4
  ret void
 }
 !amdgpu.pal.metadata.msgpack = !{!0}
 ; Function Attrs: nounwind willreturn memory(none)
 declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1
 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
 declare i64 @llvm.amdgcn.s.getpc() #2
 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
 declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3
 attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode" "amdgpu-dynamic-vgpr-block-size"="16" }
 attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }
 !0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
 !1 = !{i32 7}
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12-dvgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12-dvgpr.mir
@ -0,0 +1,340 @@
 # RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
 --- |
  ; Empty amdgpu_ps IR stubs, one per machine function defined later in this
  ; file. Attribute group #0 sets "amdgpu-dynamic-vgpr-block-size" to "16";
  ; group #1 sets the same value and also adds noinline and optnone (used
  ; only by @global_store_optnone).
  define amdgpu_ps void @tbuffer_store1() #0 { ret void }
  define amdgpu_ps void @tbuffer_store2() #0 { ret void }
  define amdgpu_ps void @flat_store() #0 { ret void }
  define amdgpu_ps void @global_store() #0 { ret void }
  define amdgpu_ps void @buffer_store_format() #0 { ret void }
  define amdgpu_ps void @ds_write_b32() #0 { ret void }
  define amdgpu_ps void @global_store_dword() #0 { ret void }
  define amdgpu_ps void @multiple_basic_blocks1() #0 { ret void }
  define amdgpu_ps void @multiple_basic_blocks2() #0 { ret void }
  define amdgpu_ps void @multiple_basic_blocks3() #0 { ret void }
  define amdgpu_ps void @recursive_loop() #0 { ret void }
  define amdgpu_ps void @recursive_loop_vmem() #0 { ret void }
  define amdgpu_ps void @image_store() #0 { ret void }
  define amdgpu_ps void @scratch_store() #0 { ret void }
  define amdgpu_ps void @buffer_atomic() #0 { ret void }
  define amdgpu_ps void @flat_atomic() #0 { ret void }
  define amdgpu_ps void @global_atomic() #0 { ret void }
  define amdgpu_ps void @image_atomic() #0 { ret void }
  define amdgpu_ps void @global_store_optnone() #1 { ret void }
  attributes #0 = { "amdgpu-dynamic-vgpr-block-size" = "16" }
  attributes #1 = { "amdgpu-dynamic-vgpr-block-size" = "16" noinline optnone }
 ...
 ---
 # Single block with a TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact followed by
 # S_ENDPGM 0. The checks require that no S_SENDMSG 3 appears before an
 # S_ALLOC_VGPR 0, which in turn must precede S_ENDPGM 0.
 name:            tbuffer_store1
 body:             |
  bb.0:
    ; CHECK-LABEL: name: tbuffer_store1
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
    S_ENDPGM 0
 ...
 ---
 # Single block with a TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact (carrying a
 # memory operand) followed by S_ENDPGM 0. The checks require that no
 # S_SENDMSG 3 appears before an S_ALLOC_VGPR 0, which must precede
 # S_ENDPGM 0.
 name:            tbuffer_store2
 body:             |
  bb.0:
    ; CHECK-LABEL: name: tbuffer_store2
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
    S_ENDPGM 0
 ...
 ---
 # Single block with a FLAT_STORE_DWORDX4 followed by S_ENDPGM 0. The checks
 # require that no S_SENDMSG 3 appears before an S_ALLOC_VGPR 0, which must
 # precede S_ENDPGM 0.
 name:            flat_store
 body:             |
  bb.0:
    ; CHECK-LABEL: name: flat_store
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
    S_ENDPGM 0
 ...
 ---
 # Single block with a GLOBAL_STORE_DWORD, an explicit S_WAIT_STORECNT 0 and
 # then S_ENDPGM 0. The checks require that no S_SENDMSG 3 appears before an
 # S_ALLOC_VGPR 0, which must precede S_ENDPGM 0.
 name:            global_store
 body:             |
  bb.0:
    ; CHECK-LABEL: name: global_store
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
    S_WAIT_STORECNT 0
    S_ENDPGM 0
 ...
 ---
 # Single block with a BUFFER_STORE_FORMAT_D16_X_OFFEN_exact followed by
 # S_ENDPGM 0. The checks require that no S_SENDMSG 3 appears before an
 # S_ALLOC_VGPR 0, which must precede S_ENDPGM 0.
 name:            buffer_store_format
 body:             |
  bb.0:
    ; CHECK-LABEL: name: buffer_store_format
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
    S_ENDPGM 0
 ...
 ---
 # Single block that defines $vgpr0 and $vgpr1 with IMPLICIT_DEF, issues a
 # DS_WRITE_B32 using both, and ends with S_ENDPGM 0. The checks require that
 # no S_SENDMSG 3 appears before an S_ALLOC_VGPR 0, which must precede
 # S_ENDPGM 0.
 name:            ds_write_b32
 body:             |
  bb.0:
    ; CHECK-LABEL: name: ds_write_b32
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $vgpr0 = IMPLICIT_DEF
    renamable $vgpr1 = IMPLICIT_DEF
    DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec, implicit $m0
    S_ENDPGM 0
 ...
 ---
 # Single block with live-ins $vgpr0 and $sgpr0_sgpr1. A V_MAD_I32_I24_e64
 # result in $vgpr0 is used as an operand of a GLOBAL_STORE_DWORD_SADDR,
 # followed by S_ENDPGM 0. The checks require that no S_SENDMSG 3 appears
 # before an S_ALLOC_VGPR 0, which must precede S_ENDPGM 0.
 name:            global_store_dword
 body:             |
  bb.0:
    liveins: $vgpr0, $sgpr0_sgpr1
    ; CHECK-LABEL: name: global_store_dword
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
    GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
    S_ENDPGM 0
 ...
 ---
 # Three blocks. bb.0 issues a BUFFER_LOAD_FORMAT_X_IDXEN into $vgpr0 and
 # branches to bb.1; bb.1 uses $vgpr0 in a V_ADD_U32_e32 and may branch back
 # to itself or on to bb.2; bb.2 is just S_ENDPGM 0. No store instruction is
 # present. The checks require that no S_SENDMSG 3 appears before an
 # S_ALLOC_VGPR 0, which must precede S_ENDPGM 0.
 name:            multiple_basic_blocks1
 body:             |
  ; CHECK-LABEL: name: multiple_basic_blocks1
  ; CHECK-NOT: S_SENDMSG 3
  ; CHECK: S_ALLOC_VGPR 0
  ; CHECK:   S_ENDPGM 0
  bb.0:
    successors: %bb.1
    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1
  bb.1:
    successors: %bb.1, %bb.2
    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2
  bb.2:
    S_ENDPGM 0
 ...
 ---
 # Three blocks. bb.0 does a TBUFFER_STORE_FORMAT_X_OFFSET_exact and then a
 # V_ADD_U32_e32 before branching to bb.2; bb.1 has the same two instructions
 # in the opposite order and also branches to bb.2, but is not listed as a
 # successor of any block; bb.2 is just S_ENDPGM 0. The checks are anchored
 # at the bb.2 label and require that no S_SENDMSG 3 appears between it and
 # an S_ALLOC_VGPR 0, which must precede S_ENDPGM 0.
 name:            multiple_basic_blocks2
 body:             |
  ; CHECK-LABEL: name: multiple_basic_blocks2
  ; CHECK: bb.2:
  ; CHECK-NOT: S_SENDMSG 3
  ; CHECK: S_ALLOC_VGPR 0
  ; CHECK: S_ENDPGM 0
  bb.0:
    successors: %bb.2
    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
    S_BRANCH %bb.2
  bb.1:
    successors: %bb.2
    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
    S_BRANCH %bb.2
  bb.2:
    S_ENDPGM 0
 ...
 ---
 # Five blocks. Only bb.0 contains a store (TBUFFER_STORE_FORMAT_X_OFFSET_exact
 # after a V_ADD_U32_e32); bb.1 and bb.3 contain only a V_ADD_U32_e32 and are
 # not listed as successors of any block; bb.2 just branches to bb.4; bb.4 is
 # S_ENDPGM 0. The checks are anchored at the bb.4 label and require that no
 # S_SENDMSG 3 appears between it and an S_ALLOC_VGPR 0, which must precede
 # S_ENDPGM 0.
 name:            multiple_basic_blocks3
 body:             |
  ; CHECK-LABEL: name: multiple_basic_blocks3
  ; CHECK: bb.4:
  ; CHECK-NOT: S_SENDMSG 3
  ; CHECK: S_ALLOC_VGPR 0
  ; CHECK: S_ENDPGM 0
  bb.0:
    successors: %bb.2
    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
    TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
    S_BRANCH %bb.2
  bb.1:
    successors: %bb.2
    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
    S_BRANCH %bb.2
  bb.2:
    successors: %bb.4
    S_BRANCH %bb.4
  bb.3:
    successors: %bb.4
    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
    S_BRANCH %bb.4
  bb.4:
    S_ENDPGM 0
 ...
 ---
 # Three blocks. bb.0 issues a BUFFER_LOAD_FORMAT_X_IDXEN and branches to
 # bb.1; bb.1 contains only a scalar compare and branches (back to itself or
 # on to bb.2); bb.2 is S_ENDPGM 0. The checks require that no S_SENDMSG 3
 # appears before an S_ALLOC_VGPR 0, which must precede S_ENDPGM 0.
 name:            recursive_loop
 body:             |
  ; CHECK-LABEL: name: recursive_loop
  ; CHECK-NOT: S_SENDMSG 3
  ; CHECK: S_ALLOC_VGPR 0
  ; CHECK:   S_ENDPGM 0
  bb.0:
    successors: %bb.1
    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1
  bb.1:
    successors: %bb.1, %bb.2
    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2
  bb.2:
    S_ENDPGM 0
 ...
 ---
 # Three blocks. bb.0 issues a BUFFER_LOAD_FORMAT_X_IDXEN and branches to
 # bb.1; bb.1 does a TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact, then a scalar
 # compare, and branches back to itself or on to bb.2; bb.2 is S_ENDPGM 0.
 # The checks require that no S_SENDMSG 3 appears before an S_ALLOC_VGPR 0,
 # which must precede S_ENDPGM 0.
 name:            recursive_loop_vmem
 body:             |
  ; CHECK-LABEL: name: recursive_loop_vmem
  ; CHECK-NOT: S_SENDMSG 3
  ; CHECK: S_ALLOC_VGPR 0
  ; CHECK: S_ENDPGM 0
  bb.0:
    successors: %bb.1
    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1
  bb.1:
    successors: %bb.1, %bb.2
    TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2
  bb.2:
    S_ENDPGM 0
 ...
 ---
 # Single block with an IMAGE_STORE_V2_V1_gfx11 (carrying a memory operand)
 # followed by S_ENDPGM 0. The checks require that no S_SENDMSG 3 appears
 # before an S_ALLOC_VGPR 0, which must precede S_ENDPGM 0.
 name:            image_store
 body:             |
  bb.0:
  ; CHECK-LABEL: name: image_store
  ; CHECK-NOT: S_SENDMSG 3
  ; CHECK: S_ALLOC_VGPR 0
  ; CHECK: S_ENDPGM 0
  IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
  S_ENDPGM 0
 ...
 ---
 # Single block that masks $sgpr0 with -16 via S_AND_B32, uses it as an
 # operand of a SCRATCH_STORE_DWORD_SADDR, and ends with S_ENDPGM 0. The
 # checks require that no S_SENDMSG 3 appears before an S_ALLOC_VGPR 0,
 # which must precede S_ENDPGM 0.
 name:            scratch_store
 body:             |
  bb.0:
    ; CHECK-LABEL: name: scratch_store
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
    SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
    S_ENDPGM 0
 ...
 ---
 # Single block with a BUFFER_ATOMIC_ADD_F32_OFFEN (volatile load-store
 # memory operand) followed by S_ENDPGM 0. The checks require that no
 # S_SENDMSG 3 appears before an S_ALLOC_VGPR 0, which must precede
 # S_ENDPGM 0.
 name:            buffer_atomic
 body:             |
  bb.0:
    ; CHECK-LABEL: name: buffer_atomic
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
    S_ENDPGM 0
 ...
 ---
 # Single block with a FLAT_ATOMIC_DEC_X2_RTN that defines $vgpr0_vgpr1,
 # followed by S_ENDPGM 0. The checks require that no S_SENDMSG 3 appears
 # before an S_ALLOC_VGPR 0, which must precede S_ENDPGM 0.
 name:            flat_atomic
 body:             |
  bb.0:
    ; CHECK-LABEL: name: flat_atomic
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
    S_ENDPGM 0
 ...
 ---
 # Single block with a GLOBAL_ATOMIC_INC_X2_SADDR_RTN that defines
 # $vgpr0_vgpr1, followed by S_ENDPGM 0. The checks require that no
 # S_SENDMSG 3 appears before an S_ALLOC_VGPR 0, which must precede
 # S_ENDPGM 0.
 name:            global_atomic
 body:             |
  bb.0:
    ; CHECK-LABEL: name: global_atomic
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
    S_ENDPGM 0
 ...
 ---
 # Single block with an IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 that defines
 # $vgpr0_vgpr1_vgpr2_vgpr3, followed by S_ENDPGM 0. The checks require that
 # no S_SENDMSG 3 appears before an S_ALLOC_VGPR 0, which must precede
 # S_ENDPGM 0.
 name:            image_atomic
 body:             |
  bb.0:
    ; CHECK-LABEL: name: image_atomic
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
    S_ENDPGM 0
 ...
 ---
 # Same instruction sequence as global_store, but the matching IR stub uses
 # attribute group #1 (noinline optnone). Here the checks require that
 # neither S_SENDMSG 3 nor any S_ALLOC_VGPR appears before S_ENDPGM 0.
 name:            global_store_optnone
 body:             |
  bb.0:
    ; CHECK-LABEL: name: global_store_optnone
    ; CHECK-NOT: S_SENDMSG 3
    ; CHECK-NOT: S_ALLOC_VGPR
    ; CHECK: S_ENDPGM 0
    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
    S_WAIT_STORECNT 0
    S_ENDPGM 0
 ...
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
@ -1,5 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
 # RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
 --- |
  define amdgpu_ps void @tbuffer_store1() { ret void }
@ -29,8 +28,7 @@ body:             |
  bb.0:
    ; CHECK-LABEL: name: tbuffer_store1
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
    S_ENDPGM 0
@ -42,8 +40,7 @@ body:             |
  bb.0:
    ; CHECK-LABEL: name: tbuffer_store2
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
    S_ENDPGM 0
@ -55,8 +52,7 @@ body:             |
  bb.0:
    ; CHECK-LABEL: name: flat_store
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
    S_ENDPGM 0
@ -68,8 +64,7 @@ body:             |
  bb.0:
    ; CHECK-LABEL: name: global_store
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
    S_WAIT_STORECNT 0
@ -82,8 +77,7 @@ body:             |
  bb.0:
    ; CHECK-LABEL: name: buffer_store_format
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
    S_ENDPGM 0
@ -95,8 +89,7 @@ body:             |
  bb.0:
    ; CHECK-LABEL: name: ds_write_b32
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $vgpr0 = IMPLICIT_DEF
    renamable $vgpr1 = IMPLICIT_DEF
@ -112,8 +105,7 @@ body:             |
    ; CHECK-LABEL: name: global_store_dword
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
    GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
@ -125,8 +117,7 @@ name:            multiple_basic_blocks1
 body:             |
  ; CHECK-LABEL: name: multiple_basic_blocks1
  ; CHECK-NOT: S_SENDMSG 3
-  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; CHECK-NOT: S_ALLOC_VGPR
  ; DVGPR: S_ALLOC_VGPR 0
  ; CHECK:   S_ENDPGM 0
  bb.0:
    successors: %bb.1
@ -153,8 +144,7 @@ body:             |
  ; CHECK-LABEL: name: multiple_basic_blocks2
  ; CHECK: bb.2:
  ; CHECK-NOT: S_SENDMSG 3
-  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; CHECK-NOT: S_ALLOC_VGPR
  ; DVGPR: S_ALLOC_VGPR 0
  ; CHECK: S_ENDPGM 0
  bb.0:
    successors: %bb.2
@ -180,8 +170,7 @@ body:             |
  ; CHECK-LABEL: name: multiple_basic_blocks3
  ; CHECK: bb.4:
  ; CHECK-NOT: S_SENDMSG 3
-  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; CHECK-NOT: S_ALLOC_VGPR
  ; DVGPR: S_ALLOC_VGPR 0
  ; CHECK: S_ENDPGM 0
  bb.0:
    successors: %bb.2
@ -216,8 +205,7 @@ name:            recursive_loop
 body:             |
  ; CHECK-LABEL: name: recursive_loop
  ; CHECK-NOT: S_SENDMSG 3
-  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; CHECK-NOT: S_ALLOC_VGPR
  ; DVGPR: S_ALLOC_VGPR 0
  ; CHECK:   S_ENDPGM 0
  bb.0:
    successors: %bb.1
@ -241,8 +229,7 @@ name:            recursive_loop_vmem
 body:             |
  ; CHECK-LABEL: name: recursive_loop_vmem
  ; CHECK-NOT: S_SENDMSG 3
-  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; CHECK-NOT: S_ALLOC_VGPR
  ; DVGPR: S_ALLOC_VGPR 0
  ; CHECK: S_ENDPGM 0
  bb.0:
    successors: %bb.1
@ -268,8 +255,7 @@ body:             |
  bb.0:
  ; CHECK-LABEL: name: image_store
  ; CHECK-NOT: S_SENDMSG 3
-  ; DEFAULT-NOT: S_ALLOC_VGPR
+  ; CHECK-NOT: S_ALLOC_VGPR
  ; DVGPR: S_ALLOC_VGPR 0
  ; CHECK: S_ENDPGM 0
  IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
  S_ENDPGM 0
@ -281,8 +267,7 @@ body:             |
  bb.0:
    ; CHECK-LABEL: name: scratch_store
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
    SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
@ -295,8 +280,7 @@ body:             |
  bb.0:
    ; CHECK-LABEL: name: buffer_atomic
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
    S_ENDPGM 0
@ -308,8 +292,7 @@ body:             |
  bb.0:
    ; CHECK-LABEL: name: flat_atomic
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
    S_ENDPGM 0
@ -322,8 +305,7 @@ body:             |
  bb.0:
    ; CHECK-LABEL: name: global_atomic
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
    S_ENDPGM 0
@ -335,8 +317,7 @@ body:             |
  bb.0:
    ; CHECK-LABEL: name: image_atomic
    ; CHECK-NOT: S_SENDMSG 3
-    ; DEFAULT-NOT: S_ALLOC_VGPR
+    ; CHECK-NOT: S_ALLOC_VGPR
    ; DVGPR: S_ALLOC_VGPR 0
    ; CHECK: S_ENDPGM 0
    renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
    S_ENDPGM 0
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@ -44,6 +44,7 @@
 ; CHECK-NEXT:   sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT:   longBranchReservedReg: ''
 ; CHECK-NEXT:   hasInitWholeWave: false
 ; CHECK-NEXT:   dynamicVGPRBlockSize: 0
 ; CHECK-NEXT:   scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
  define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
@ -312,6 +313,7 @@
 ; CHECK-NEXT:   sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT:   longBranchReservedReg: ''
 ; CHECK-NEXT:   hasInitWholeWave: false
 ; CHECK-NEXT:   dynamicVGPRBlockSize: 0
 ; CHECK-NEXT:   scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
  define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@ -44,6 +44,7 @@
 ; AFTER-PEI-NEXT: sgprForEXECCopy: ''
 ; AFTER-PEI-NEXT: longBranchReservedReg: ''
 ; AFTER-PEI-NEXT: hasInitWholeWave: false
 ; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0
 ; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
 ; AFTER-PEI-NEXT: body:
 define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@ -44,6 +44,7 @@
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
  define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@ -44,6 +44,7 @@
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@ -53,6 +53,7 @@
 # FULL-NEXT:  sgprForEXECCopy: ''
 # FULL-NEXT:  longBranchReservedReg: ''
 # FULL-NEXT:  hasInitWholeWave: false
 # FULL-NEXT: dynamicVGPRBlockSize: 0
 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
 # FULL-NEXT: body:
@ -159,6 +160,7 @@ body:             |
 # FULL-NEXT: sgprForEXECCopy: ''
 # FULL-NEXT: longBranchReservedReg: ''
 # FULL-NEXT: hasInitWholeWave: false
 # FULL-NEXT: dynamicVGPRBlockSize: 0
 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
 # FULL-NEXT: body:
@ -236,6 +238,7 @@ body:             |
 # FULL-NEXT: sgprForEXECCopy: ''
 # FULL-NEXT: longBranchReservedReg: ''
 # FULL-NEXT: hasInitWholeWave: false
 # FULL-NEXT: dynamicVGPRBlockSize: 0
 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
 # FULL-NEXT: body:
@ -314,6 +317,7 @@ body:             |
 # FULL-NEXT: sgprForEXECCopy: ''
 # FULL-NEXT: longBranchReservedReg: ''
 # FULL-NEXT: hasInitWholeWave: false
 # FULL-NEXT: dynamicVGPRBlockSize: 0
 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
 # FULL-NEXT: body:
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@ -54,6 +54,7 @@
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
@ -102,6 +103,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
@ -174,6 +176,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define void @function() {
@ -228,6 +231,7 @@ define void @function() {
 ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: ''
 ; CHECK-NEXT: hasInitWholeWave: false
 ; CHECK-NEXT: dynamicVGPRBlockSize: 0
 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
 ; CHECK-NEXT: body:
 define void @function_nsz() #0 {
--- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
+++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
@ -93,16 +93,16 @@ static const std::pair<StringRef, StringRef>
  W32FS = {"+wavefrontsize32", "w32"},
  W64FS = {"+wavefrontsize64", "w64"};
+// Callback signature shared by the per-occupancy checks: output table,
+// occupancy under test, subtarget, and the dynamic VGPR block size (0 when
+// dynamic VGPRs are not enabled). The block size must be unsigned: a bool
+// parameter would collapse 16 and 32 to 1 before the callback sees them.
-using TestFuncTy =
+using TestFuncTy = function_ref<bool(std::stringstream &, unsigned,
-    function_ref<bool(std::stringstream &, unsigned, const GCNSubtarget &)>;
+                                     const GCNSubtarget &, unsigned)>;
 static bool testAndRecord(std::stringstream &Table, const GCNSubtarget &ST,
-                          TestFuncTy test) {
+                          TestFuncTy test, unsigned DynamicVGPRBlockSize) {
  bool Success = true;
  unsigned MaxOcc = ST.getMaxWavesPerEU();
  for (unsigned Occ = MaxOcc; Occ > 0; --Occ) {
    Table << std::right << std::setw(3) << Occ << "    ";
-    Success = test(Table, Occ, ST) && Success;
+    Success = test(Table, Occ, ST, DynamicVGPRBlockSize) && Success;
    Table << '\n';
  }
  return Success;
@ -132,7 +132,7 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
        FS = &W32FS;
      std::stringstream Table;
-      bool Success = testAndRecord(Table, ST, test);
+      bool Success = testAndRecord(Table, ST, test, /*DynamicVGPRBlockSize=*/0);
      if (!Success || PrintCpuRegLimits)
        TablePerCPUs[Table.str()].push_back((CanonCPUName + FS->second).str());
@ -155,40 +155,50 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
 static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
                                  TestFuncTy test) {
-  auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
+  auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
                                      "+dynamic-vgpr," + FS.str());
  ASSERT_TRUE(TM) << "No target machine";
  GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
                  std::string(TM->getTargetFeatureString()), *TM);
  ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
  auto testWithBlockSize = [&](unsigned DynamicVGPRBlockSize) {
    std::stringstream Table;
-  bool Success = testAndRecord(Table, ST, test);
+    bool Success = testAndRecord(Table, ST, test, DynamicVGPRBlockSize);
    EXPECT_TRUE(Success && !PrintCpuRegLimits)
-      << CPUName << " dynamic VGPR " << FS
+        << CPUName << " dynamic VGPR block size " << DynamicVGPRBlockSize
        << ":\nOcc    MinVGPR        MaxVGPR\n"
        << Table.str() << '\n';
  };
  testWithBlockSize(16);
  testWithBlockSize(32);
 }
 TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
-  auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
+  auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST,
-    unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
+                 unsigned DynamicVGPRBlockSize) {
    unsigned MaxVGPRNum = ST.getAddressableNumVGPRs(DynamicVGPRBlockSize);
    return checkMinMax(
-        OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum), ST.getMaxWavesPerEU(),
+        OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum, DynamicVGPRBlockSize),
-        [&](unsigned NumGPRs) { return ST.getOccupancyWithNumVGPRs(NumGPRs); },
+        ST.getMaxWavesPerEU(),
-        [&](unsigned Occ) { return ST.getMinNumVGPRs(Occ); },
+        [&](unsigned NumGPRs) {
-        [&](unsigned Occ) { return ST.getMaxNumVGPRs(Occ); });
+          return ST.getOccupancyWithNumVGPRs(NumGPRs, DynamicVGPRBlockSize);
        },
        [&](unsigned Occ) {
          return ST.getMinNumVGPRs(Occ, DynamicVGPRBlockSize);
        },
        [&](unsigned Occ) {
          return ST.getMaxNumVGPRs(Occ, DynamicVGPRBlockSize);
        });
  };
  testGPRLimits("VGPR", true, test);
  testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
  testDynamicVGPRLimits("gfx1200",
                        "+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
 }
 static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
                               unsigned DynamicVGPRBlockSize,
                               unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
                               unsigned ExpectedMaxVGPRs) {
  auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
@ -206,11 +216,15 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
  Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
  Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
  std::string DVGPRBlockSize = std::to_string(DynamicVGPRBlockSize);
  if (DynamicVGPRBlockSize)
    Func->addFnAttr("amdgpu-dynamic-vgpr-block-size", DVGPRBlockSize);
  auto Range = ST.getWavesPerEU(*Func);
  EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
  EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
  EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
-  EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
+  EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs(DynamicVGPRBlockSize))
      << CPUName << ' ' << FS;
  // Function with requested 'amdgpu-waves-per-eu' in a valid range.
@ -221,11 +235,10 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
 }
 TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
-  testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
+  // CPUName, Features, DynamicVGPRBlockSize; Expected MinOcc, MaxOcc, MaxVGPRs
-  testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
+  testAbsoluteLimits("gfx1200", "+wavefrontsize32", 0, 1, 16, 256);
-  testAbsoluteLimits(
+  testAbsoluteLimits("gfx1200", "+wavefrontsize32", 16, 1, 16, 128);
-      "gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
+  testAbsoluteLimits("gfx1200", "+wavefrontsize32", 32, 1, 16, 256);
      1, 16, 256);
 }
 static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {