[AMDGPU] Replace dynamic VGPR feature with attribute (#133444)

Use a function attribute (amdgpu-dynamic-vgpr-block-size) instead of a subtarget
feature, as requested in #130030.
This commit is contained in:
Diana Picus 2025-06-24 11:09:36 +02:00 committed by GitHub
parent 6cfa03f1f1
commit a201f8872a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
34 changed files with 1205 additions and 199 deletions

View File

@ -768,11 +768,6 @@ For example:
performant than code generated for XNACK replay
disabled.
dynamic-vgpr TODO Represents the "Dynamic VGPR" hardware mode, introduced in GFX12.
Waves launched in this mode may allocate or deallocate the VGPRs
using dedicated instructions, but may not send the DEALLOC_VGPRS
message.
=============== ============================ ==================================================
.. _amdgpu-target-id:
@ -1764,6 +1759,15 @@ The AMDGPU backend supports the following LLVM IR attributes.
"amdgpu-promote-alloca-to-vector-vgpr-ratio" Ratio of VGPRs to budget for promoting alloca to vectors.
"amdgpu-dynamic-vgpr-block-size" Represents the size of a VGPR block in the "Dynamic VGPR" hardware mode,
introduced in GFX12.
A value of 0 (default) means that dynamic VGPRs are not enabled.
Valid values for GFX12+ are 16 and 32.
Waves launched in this mode may allocate or deallocate the VGPRs
using dedicated instructions, but may not send the DEALLOC_VGPRS
message. If a shader has this attribute, then all its callees must
match its value.
================================================ ==========================================================
Calling Conventions

View File

@ -1281,12 +1281,14 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
>;
// FIXME: Remove after all users are migrated to attribute.
// "Dynamic VGPR" hardware mode, introduced in GFX12: waves launched in this
// mode may allocate or deallocate VGPRs using dedicated instructions, but may
// not send the DEALLOC_VGPRS message.
def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
"DynamicVGPR",
"true",
"Enable dynamic VGPR mode"
>;
// FIXME: Remove after all users are migrated to attribute.
def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
"DynamicVGPRBlockSize32",
"true",

View File

@ -452,15 +452,17 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
unsigned MaxWaves = MFI.getMaxWavesPerEU();
uint64_t TotalNumVgpr =
getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
uint64_t NumVGPRsForWavesPerEU = std::max(
{TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
uint64_t NumVGPRsForWavesPerEU =
std::max({TotalNumVgpr, (uint64_t)1,
(uint64_t)STM.getMinNumVGPRs(
MaxWaves, MFI.getDynamicVGPRBlockSize())});
uint64_t NumSGPRsForWavesPerEU = std::max(
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
STM.getOccupancyWithWorkGroupSizes(*MF).second,
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
OutContext);
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
MFI.getDynamicVGPRBlockSize(), STM, OutContext);
uint64_t Occupancy;
const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
@ -1082,7 +1084,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
Ctx);
ProgInfo.NumVGPRsForWavesPerEU =
AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
CreateExpr(STM.getMinNumVGPRs(
MaxWaves, MFI->getDynamicVGPRBlockSize()))},
Ctx);
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
@ -1256,7 +1259,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
MFI->getDynamicVGPRBlockSize(), STM, Ctx);
const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
@ -1405,7 +1409,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
// Helper function to add common PAL Metadata 3.0+
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
const SIProgramInfo &CurrentProgramInfo,
CallingConv::ID CC, const GCNSubtarget &ST) {
CallingConv::ID CC, const GCNSubtarget &ST,
unsigned DynamicVGPRBlockSize) {
if (ST.hasIEEEMode())
MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
@ -1417,7 +1422,7 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
(bool)CurrentProgramInfo.TrapHandlerEnable);
MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
if (ST.isDynamicVGPREnabled())
if (DynamicVGPRBlockSize != 0)
MD->setComputeRegisters(".dynamic_vgpr_en", true);
}
@ -1444,7 +1449,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
// For targets that support dynamic VGPRs, set the number of saved dynamic
// VGPRs (if any) in the PAL metadata.
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.isDynamicVGPREnabled() &&
if (MFI->isDynamicVGPREnabled() &&
MFI->getScratchReservedForDynamicVGPRs() > 0)
MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
MFI->getScratchReservedForDynamicVGPRs() / 4);
@ -1470,7 +1475,8 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
CurrentProgramInfo.ScratchEnable);
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
MFI->getDynamicVGPRBlockSize());
}
// ScratchSize is in bytes, 16 aligned.
@ -1541,7 +1547,9 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
MD->setRsrc2(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
} else {
EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
EmitPALMetadataCommon(
MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
}
// Set optional info

View File

@ -173,8 +173,16 @@ static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
return 128;
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
// Temporarily check both the attribute and the subtarget feature, until the
// latter is removed.
if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
unsigned MaxVGPRs = ST.getMaxNumVGPRs(
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first);
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
DynamicVGPRBlockSize);
// A non-entry function has only 32 caller preserved registers.
// Do not promote alloca which will force spilling unless we know the function

View File

@ -448,7 +448,10 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// TODO: assert Regions are sorted descending by pressure
const auto &ST = MF.getSubtarget<GCNSubtarget>();
const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
const unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
const auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
<< ", current = " << Occ << '\n');
@ -457,7 +460,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// Always build the DAG to add mutations
BuildDAG DAG(*R, *this);
if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc)
continue;
LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
@ -468,7 +471,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
printSchedRP(dbgs(), R->MaxPressure, MaxRP));
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize));
if (NewOcc <= Occ)
break;
@ -489,9 +492,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto &ST = MF.getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
@ -522,19 +527,21 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto RP = getRegionPressure(*R);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
ST, DynamicVGPRBlockSize) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
} else {
LLVM_DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder();
assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >=
TgtOcc);
}
}
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
FinalOccupancy =
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
}
MFI->limitOccupancy(FinalOccupancy);
@ -580,9 +587,11 @@ void GCNIterativeScheduler::scheduleILP(
const auto &ST = MF.getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
@ -603,17 +612,18 @@ void GCNIterativeScheduler::scheduleILP(
const auto RP = getSchedulePressure(*R, ILPSchedule);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
ST, DynamicVGPRBlockSize) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
}
} else {
scheduleRegion(*R, ILPSchedule, RP);
LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
FinalOccupancy =
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
}
MFI->limitOccupancy(FinalOccupancy);

View File

@ -251,7 +251,9 @@ bool GCNNSAReassignImpl::run(MachineFunction &MF) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
MaxNumVGPRs = std::min(
ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->getDynamicVGPRBlockSize()),
MaxNumVGPRs);
CSRegs = MRI->getCalleeSavedRegs();
using Candidate = std::pair<const MachineInstr*, bool>;

View File

@ -13,6 +13,7 @@
#include "GCNRegPressure.h"
#include "AMDGPU.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@ -94,17 +95,20 @@ void GCNRegPressure::inc(unsigned Reg,
bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGPRNum()));
const auto VGPROcc =
std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
const auto VGPROcc = std::min(
MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
const auto OtherSGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
const auto OtherVGPROcc =
std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts())));
std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@ -226,13 +230,15 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
O.getVGPRNum(ST.hasGFX90AInsts()));
}
Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
return Printable([&RP, ST](raw_ostream &OS) {
Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
unsigned DynamicVGPRBlockSize) {
return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) {
OS << "VGPRs: " << RP.getArchVGPRNum() << ' '
<< "AGPRs: " << RP.getAGPRNum();
if (ST)
OS << "(O"
<< ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()))
<< ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()),
DynamicVGPRBlockSize)
<< ')';
OS << ", SGPRs: " << RP.getSGPRNum();
if (ST)
@ -240,7 +246,7 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight()
<< ", LSGPR WT: " << RP.getSGPRTuplesWeight();
if (ST)
OS << " -> Occ: " << RP.getOccupancy(*ST);
OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize);
OS << '\n';
});
}

View File

@ -69,9 +69,11 @@ struct GCNRegPressure {
}
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
unsigned getOccupancy(const GCNSubtarget &ST) const {
unsigned getOccupancy(const GCNSubtarget &ST,
unsigned DynamicVGPRBlockSize) const {
return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
}
void inc(unsigned Reg,
@ -79,8 +81,10 @@ struct GCNRegPressure {
LaneBitmask NewMask,
const MachineRegisterInfo &MRI);
bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const {
return getOccupancy(ST) > O.getOccupancy(ST);
bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O,
unsigned DynamicVGPRBlockSize) const {
return getOccupancy(ST, DynamicVGPRBlockSize) >
O.getOccupancy(ST, DynamicVGPRBlockSize);
}
/// Compares \p this GCNRegPressure to \p O, returning true if \p this is
@ -133,7 +137,8 @@ private:
friend GCNRegPressure max(const GCNRegPressure &P1,
const GCNRegPressure &P2);
friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST);
friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST,
unsigned DynamicVGPRBlockSize);
};
inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
@ -402,7 +407,8 @@ GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
bool isEqual(const GCNRPTracker::LiveRegSet &S1,
const GCNRPTracker::LiveRegSet &S2);
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr);
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr,
unsigned DynamicVGPRBlockSize = 0);
Printable print(const GCNRPTracker::LiveRegSet &LiveRegs,
const MachineRegisterInfo &MRI);

View File

@ -99,17 +99,20 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
if (!KnownExcessRP) {
VGPRCriticalLimit =
std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
VGPRCriticalLimit = std::min(
ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
VGPRExcessLimit);
} else {
// This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
// returns a reasonably small number for targets with lots of VGPRs, such
// as GFX10 and GFX11.
LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
"VGPRCriticalLimit calculation method.\n");
unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
unsigned Granule =
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
unsigned Addressable =
AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
VGPRBudget = std::max(VGPRBudget, Granule);
VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
@ -1136,7 +1139,8 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
if (DAG.MinOccupancy > InitialOccupancy) {
for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
DAG.RegionsWithMinOcc[IDX] =
DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
DAG.Pressure[IDX].getOccupancy(
DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << StageID
<< " stage successfully increased occupancy to "
@ -1273,11 +1277,14 @@ void GCNSchedStage::checkScheduling() {
LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
DAG.MinOccupancy;
// Early out if we have achieved the occupancy target.
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
@ -1286,10 +1293,10 @@ void GCNSchedStage::checkScheduling() {
unsigned TargetOccupancy = std::min(
S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
unsigned WavesAfter =
std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
unsigned WavesBefore =
std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
unsigned WavesAfter = std::min(
TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
unsigned WavesBefore = std::min(
TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
<< ", after " << WavesAfter << ".\n");
@ -1338,7 +1345,8 @@ void GCNSchedStage::checkScheduling() {
} else {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
DAG.MinOccupancy;
}
}
@ -1461,11 +1469,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
return true;
// For dynamic VGPR mode, we don't want to waste any VGPR blocks.
if (ST.isDynamicVGPREnabled()) {
if (DAG.MFI.isDynamicVGPREnabled()) {
unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, PressureBefore.getVGPRNum(false));
&ST, DAG.MFI.getDynamicVGPRBlockSize(),
PressureBefore.getVGPRNum(false));
unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, PressureAfter.getVGPRNum(false));
&ST, DAG.MFI.getDynamicVGPRBlockSize(),
PressureAfter.getVGPRNum(false));
if (BlocksAfter > BlocksBefore)
return true;
}
@ -1489,7 +1499,8 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
// If RP is not reduced in the unclustered reschedule stage, revert to the
// old schedule.
if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
if ((WavesAfter <=
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
mayCauseSpilling(WavesAfter)) ||
GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
@ -1511,8 +1522,9 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
ScheduleMetrics MAfter = getScheduleMetrics(DAG);
unsigned OldMetric = MBefore.getMetric();
unsigned NewMetric = MAfter.getMetric();
unsigned WavesBefore =
std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
unsigned WavesBefore = std::min(
S.getTargetOccupancy(),
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
unsigned Profit =
((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
@ -1566,7 +1578,8 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
void GCNSchedStage::revertScheduling() {
DAG.RegionsWithMinOcc[RegionIdx] =
PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) ==
DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
DAG.RegionEnd = DAG.RegionBegin;
int SkippedDebugInstr = 0;
@ -1844,13 +1857,16 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
// occupancy, or regions with VGPR spilling) to a model of their excess RP.
DenseMap<unsigned, ExcessRP> OptRegions;
const Function &F = MF.getFunction();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
const unsigned MaxSGPRsIncOcc =
ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
const unsigned MaxVGPRsIncOcc =
ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
auto ClearOptRegionsIf = [&](bool Cond) -> bool {
@ -2163,7 +2179,9 @@ void PreRARematStage::rematerialize() {
}
}
DAG.Pressure[I] = RP;
AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(ST));
AchievedOcc = std::min(
AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
->getDynamicVGPRBlockSize()));
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
}

View File

@ -366,8 +366,11 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
getGeneration());
}
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
DynamicVGPRBlockSize);
}
unsigned
@ -403,9 +406,15 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs, unsigned NumVGPRs) const {
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
// Temporarily check both the attribute and the subtarget feature until the
// latter is removed.
if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);
unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
// Maximum occupancy may be further limited by high SGPR/VGPR usage.
MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
@ -512,9 +521,16 @@ unsigned GCNSubtarget::getBaseMaxNumVGPRs(
}
unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
// Temporarily check both the attribute and the subtarget feature, until the
// latter is removed.
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
return getBaseMaxNumVGPRs(
F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)});
F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
}
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {

View File

@ -1391,7 +1391,8 @@ public:
/// Return the maximum number of waves per SIMD for kernels using \p VGPRs
/// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
unsigned DynamicVGPRBlockSize) const;
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p F, each workgroup
@ -1549,8 +1550,8 @@ public:
unsigned getMaxNumSGPRs(const Function &F) const;
/// \returns VGPR allocation granularity supported by the subtarget.
unsigned getVGPRAllocGranule() const {
return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
}
/// \returns VGPR encoding granularity supported by the subtarget.
@ -1570,20 +1571,24 @@ public:
}
/// \returns Addressable number of VGPRs supported by the subtarget.
unsigned getAddressableNumVGPRs() const {
return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
}
/// \returns the minimum number of VGPRs that will prevent achieving more than
/// the specified number of waves \p WavesPerEU.
unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
unsigned getMinNumVGPRs(unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
DynamicVGPRBlockSize);
}
/// \returns the maximum number of VGPRs that can be used and still achieved
/// at least the specified number of waves \p WavesPerEU.
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
unsigned getMaxNumVGPRs(unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
DynamicVGPRBlockSize);
}
/// \returns max num VGPRs. This is the common utility function
@ -1686,6 +1691,9 @@ public:
}
// True iff the "dynamic-vgpr" subtarget feature is set. NOTE(review): callers
// are being migrated to the function-level query
// (SIMachineFunctionInfo::isDynamicVGPREnabled / the
// "amdgpu-dynamic-vgpr-block-size" attribute) — prefer that for new code.
bool isDynamicVGPREnabled() const { return DynamicVGPR; }
// VGPR block size used in dynamic VGPR mode: 32 when the
// "dynamic-vgpr-block-size-32" feature is set, 16 otherwise.
unsigned getDynamicVGPRBlockSize() const {
  if (DynamicVGPRBlockSize32)
    return 32;
  return 16;
}
bool requiresDisjointEarlyClobberAndUndef() const override {
// AMDGPU doesn't care if early-clobber and undef operands are allocated

View File

@ -313,13 +313,11 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
/// Remove dependency on GCNSubtarget and depend only on the necessary values
/// for said occupancy computation. Should match computeOccupancy implementation
/// without passing \p STM on.
const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(unsigned InitOcc,
const MCExpr *NumSGPRs,
const MCExpr *NumVGPRs,
const GCNSubtarget &STM,
MCContext &Ctx) {
const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(
unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs,
unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) {
unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM);
unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
unsigned Generation = STM.getGeneration();

View File

@ -93,11 +93,10 @@ public:
return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
}
static const AMDGPUMCExpr *createOccupancy(unsigned InitOcc,
const MCExpr *NumSGPRs,
const MCExpr *NumVGPRs,
const GCNSubtarget &STM,
MCContext &Ctx);
static const AMDGPUMCExpr *
createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize,
const GCNSubtarget &STM, MCContext &Ctx);
ArrayRef<const MCExpr *> getArgs() const { return Args; }
VariantKind getKind() const { return Kind; }

View File

@ -197,7 +197,9 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
// pointer becomes dead and could otherwise be reused for destination.
RPT.advanceToNext();
GCNRegPressure MaxPressure = RPT.moveMaxPressure();
unsigned Occupancy = MaxPressure.getOccupancy(*ST);
unsigned Occupancy = MaxPressure.getOccupancy(
*ST,
MI.getMF()->getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
// Don't push over half the register budget. We don't want to introduce
// spilling just to form a soft clause.

View File

@ -714,11 +714,12 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
assert(hasFP(MF));
Register FPReg = MFI->getFrameOffsetReg();
assert(FPReg != AMDGPU::FP_REG);
unsigned VGPRSize =
llvm::alignTo((ST.getAddressableNumVGPRs() -
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) *
4,
FrameInfo.getMaxAlign());
unsigned VGPRSize = llvm::alignTo(
(ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()) -
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST,
MFI->getDynamicVGPRBlockSize())) *
4,
FrameInfo.getMaxAlign());
MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
@ -2087,7 +2088,7 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
bool SIFrameLowering::mayReserveScratchForCWSR(
const MachineFunction &MF) const {
return MF.getSubtarget<GCNSubtarget>().isDynamicVGPREnabled() &&
return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) &&
AMDGPU::isCompute(MF.getFunction().getCallingConv());
}

View File

@ -1760,7 +1760,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
if (!WCG->isOptNone() &&
(ST->isDynamicVGPREnabled() ||
(MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
(ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
@ -2652,7 +2652,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
[[maybe_unused]] unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
[[maybe_unused]] unsigned NumVGPRsMax =
ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
[[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
@ -2821,7 +2822,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
// (i.e. whether we're in dynamic VGPR mode or not).
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
// waveslot limited kernel runs slower with the deallocation.
if (ST->isDynamicVGPREnabled()) {
if (MFI->isDynamicVGPREnabled()) {
for (MachineInstr *MI : ReleaseVGPRInsts) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_ALLOC_VGPR))
@ -2832,7 +2833,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
if (!ReleaseVGPRInsts.empty() &&
(MF.getFrameInfo().hasCalls() ||
ST->getOccupancyWithNumVGPRs(
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
/*DynamicVGPRBlockSize=*/false) <
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
for (MachineInstr *MI : ReleaseVGPRInsts) {
if (ST->requiresNopBeforeDeallocVGPRs()) {

View File

@ -48,6 +48,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
assert(MaxNumWorkGroups.size() == 3);
// Temporarily check both the attribute and the subtarget feature, until the
// latter is completely removed.
DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
CallingConv::ID CC = F.getCallingConv();
@ -716,6 +722,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
SpillPhysVGPRS.push_back(regToString(Reg, TRI));

View File

@ -299,6 +299,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool HasInitWholeWave = false;
unsigned DynamicVGPRBlockSize = 0;
unsigned ScratchReservedForDynamicVGPRs = 0;
SIMachineFunctionInfo() = default;
@ -352,6 +353,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
StringValue());
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
MFI.ScratchReservedForDynamicVGPRs, 0);
}
@ -469,6 +471,8 @@ private:
unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0;
unsigned DynamicVGPRBlockSize = 0;
// The size in bytes of the scratch space reserved for the CWSR trap handler
// to spill some of the dynamic VGPRs.
unsigned ScratchReservedForDynamicVGPRs = 0;
@ -820,6 +824,9 @@ public:
BytesInStackArgArea = Bytes;
}
bool isDynamicVGPREnabled() const { return DynamicVGPRBlockSize != 0; }
unsigned getDynamicVGPRBlockSize() const { return DynamicVGPRBlockSize; }
// This is only used if we need to save any dynamic VGPRs in scratch.
unsigned getScratchReservedForDynamicVGPRs() const {
return ScratchReservedForDynamicVGPRs;

View File

@ -3748,7 +3748,11 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
default:
return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
case AMDGPU::VGPR_32RegClassID:
return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF));
return std::min(
ST.getMaxNumVGPRs(
MinOcc,
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize()),
ST.getMaxNumVGPRs(MF));
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));

View File

@ -1158,10 +1158,16 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
}
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 8;
if (DynamicVGPRBlockSize != 0)
return DynamicVGPRBlockSize;
// Temporarily check the subtarget feature, until we fully switch to using
// attributes.
if (STI->getFeatureBits().test(FeatureDynamicVGPR))
return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
@ -1205,20 +1211,26 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
unsigned DynamicVGPRBlockSize) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512;
if (STI->getFeatureBits().test(FeatureDynamicVGPR))
// Temporarily check the subtarget feature, until we fully switch to using
// attributes.
if (DynamicVGPRBlockSize != 0 ||
STI->getFeatureBits().test(FeatureDynamicVGPR))
// On GFX12 we can allocate at most 8 blocks of VGPRs.
return 8 * getVGPRAllocGranule(STI);
return 8 * getVGPRAllocGranule(STI, DynamicVGPRBlockSize);
return getAddressableNumArchVGPRs(STI);
}
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs) {
return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI),
getMaxWavesPerEU(STI),
getTotalNumVGPRs(STI));
unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize) {
return getNumWavesPerEUWithNumVGPRs(
NumVGPRs, getVGPRAllocGranule(STI, DynamicVGPRBlockSize),
getMaxWavesPerEU(STI), getTotalNumVGPRs(STI));
}
unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
@ -1257,7 +1269,8 @@ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
return 5;
}
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) {
assert(WavesPerEU != 0);
unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
@ -1265,28 +1278,33 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
return 0;
unsigned TotNumVGPRs = getTotalNumVGPRs(STI);
unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI);
unsigned Granule = getVGPRAllocGranule(STI);
unsigned AddrsableNumVGPRs =
getAddressableNumVGPRs(STI, DynamicVGPRBlockSize);
unsigned Granule = getVGPRAllocGranule(STI, DynamicVGPRBlockSize);
unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule);
if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule))
return 0;
unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs);
unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs,
DynamicVGPRBlockSize);
if (WavesPerEU < MinWavesPerEU)
return getMinNumVGPRs(STI, MinWavesPerEU);
return getMinNumVGPRs(STI, MinWavesPerEU, DynamicVGPRBlockSize);
unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule);
unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
return std::min(MinNumVGPRs, AddrsableNumVGPRs);
}
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) {
assert(WavesPerEU != 0);
unsigned MaxNumVGPRs =
alignDown(getTotalNumVGPRs(STI) / WavesPerEU, getVGPRAllocGranule(STI));
unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI);
alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
getVGPRAllocGranule(STI, DynamicVGPRBlockSize));
unsigned AddressableNumVGPRs =
getAddressableNumVGPRs(STI, DynamicVGPRBlockSize);
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}
@ -1299,9 +1317,11 @@ unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI,
unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32) {
return getGranulatedNumRegisterBlocks(
NumVGPRs, getVGPRAllocGranule(STI, EnableWavefrontSize32));
NumVGPRs,
getVGPRAllocGranule(STI, DynamicVGPRBlockSize, EnableWavefrontSize32));
}
} // end namespace IsaInfo
@ -2124,6 +2144,16 @@ bool getHasDepthExport(const Function &F) {
return F.getFnAttributeAsParsedInteger("amdgpu-depth-export", 0) != 0;
}
/// \returns the VGPR block size requested through the
/// "amdgpu-dynamic-vgpr-block-size" function attribute, or 0 when the
/// attribute is absent or carries an unsupported value (i.e. dynamic VGPRs
/// are disabled).
unsigned getDynamicVGPRBlockSize(const Function &F) {
  const unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-dynamic-vgpr-block-size", 0);
  // Per the attribute's contract, only 16 and 32 are valid block sizes;
  // anything else is treated as "not enabled".
  switch (Requested) {
  case 16:
  case 32:
    return Requested;
  default:
    return 0;
  }
}
bool hasXNACK(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureXNACK);
}

View File

@ -298,7 +298,7 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
/// For subtargets which support it, \p EnableWavefrontSize32 should match
/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
unsigned
getVGPRAllocGranule(const MCSubtargetInfo *STI,
getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32 = std::nullopt);
/// \returns VGPR encoding granularity for given subtarget \p STI.
@ -321,20 +321,24 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI);
/// \returns Addressable number of VGPRs for given subtarget \p STI.
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI);
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
unsigned DynamicVGPRBlockSize);
/// \returns Minimum number of VGPRs that meets given number of waves per
/// execution unit requirement for given subtarget \p STI.
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize);
/// \returns Maximum number of VGPRs that meets given number of waves per
/// execution unit requirement for given subtarget \p STI.
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize);
/// \returns Number of waves reachable for a given \p NumVGPRs usage for given
/// subtarget \p STI.
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs);
unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize);
/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
@ -361,6 +365,7 @@ unsigned getEncodedNumVGPRBlocks(
/// subtarget \p STI when \p NumVGPRs are used.
unsigned getAllocatedNumVGPRBlocks(
const MCSubtargetInfo *STI, unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32 = std::nullopt);
} // end namespace IsaInfo
@ -1305,6 +1310,12 @@ bool getHasColorExport(const Function &F);
bool getHasDepthExport(const Function &F);
bool hasDynamicVGPR(const Function &F);
// Returns the value of the "amdgpu-dynamic-vgpr-block-size" attribute, or 0 if
// the attribute is missing or its value is invalid.
unsigned getDynamicVGPRBlockSize(const Function &F);
LLVM_READNONE
constexpr bool isShader(CallingConv::ID CC) {
switch (CC) {

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=CHECK,CHECK-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=CHECK,CHECK-FAKE16 %s
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
@ -137,7 +137,7 @@ define amdgpu_cs void @with_calls_no_inline_const() #0 {
ret void
}
define amdgpu_cs void @with_spills() {
define amdgpu_cs void @with_spills() #0 {
; CHECK-LABEL: with_spills:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
@ -366,6 +366,6 @@ define void @default() #0 {
declare amdgpu_gfx void @callee(i32) #0
attributes #0 = { nounwind }
attributes #1 = { nounwind "frame-pointer"="none" }
attributes #2 = { nounwind "frame-pointer"="all" }
attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
attributes #1 = { nounwind "frame-pointer"="none" "amdgpu-dynamic-vgpr-block-size"="16" }
attributes #2 = { nounwind "frame-pointer"="all" "amdgpu-dynamic-vgpr-block-size"="16" }

View File

@ -1,4 +1,4 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
@ -68,5 +68,5 @@ define void @default() #0 {
declare amdgpu_gfx void @callee(i32) #0
attributes #0 = { nounwind }
attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size" = "16" }

View File

@ -0,0 +1,305 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
; CHECK-NEXT: amdpal.pipelines:
; CHECK-NEXT: - .api: Vulkan
; CHECK-NEXT: .compute_registers:
; CHECK-NEXT: .dynamic_vgpr_en: true
; CHECK-NEXT: .tg_size_en: true
; CHECK-NEXT: .tgid_x_en: false
; CHECK-NEXT: .tgid_y_en: false
; CHECK-NEXT: .tgid_z_en: false
; CHECK-NEXT: .tidig_comp_cnt: 0x1
; CHECK-NEXT: .hardware_stages:
; CHECK-NEXT: .cs:
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: 0
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .sgpr_limit: 0x6a
; CHECK-NEXT: .threadgroup_dimensions:
; CHECK-NEXT: - 0x1
; CHECK-NEXT: - 0x400
; CHECK-NEXT: - 0x1
; CHECK-NEXT: .trap_present: false
; CHECK-NEXT: .user_data_reg_map:
; CHECK-NEXT: - 0x10000000
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: .user_sgprs: 0x3
; CHECK-NEXT: .vgpr_limit: 0x100
; CHECK-NEXT: .wavefront_size: 0x40
; CHECK-NEXT: .wgp_mode: true
; CHECK: .registers: {}
; CHECK-NEXT: .shader_functions:
; CHECK-NEXT: dynamic_stack:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: dynamic_stack_loop:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: multiple_stack:
; CHECK-NEXT: .backend_stack_size: 0x24
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: no_stack:
; CHECK-NEXT: .backend_stack_size: 0
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: no_stack_call:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: no_stack_extern_call:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: no_stack_extern_call_many_args:
; CHECK-NEXT: .backend_stack_size: 0x90
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x90
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: no_stack_indirect_call:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_lds:
; CHECK-NEXT: .backend_stack_size: 0
; CHECK-NEXT: .lds_size: 0x100
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: simple_lds_recurse:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0x100
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x29
; CHECK-NEXT: simple_stack:
; CHECK-NEXT: .backend_stack_size: 0x14
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: simple_stack_call:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x4
; CHECK-NEXT: simple_stack_extern_call:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_stack_indirect_call:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_stack_recurse:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x2a
; CHECK:amdpal.version:
; CHECK-NEXT: - 0x3
; CHECK-NEXT: - 0
; CHECK-NEXT:...
; CHECK-NEXT: .end_amdgpu_pal_metadata
declare amdgpu_gfx float @extern_func(float) #0
declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4
; Leaf function, no stack and no calls: metadata above expects
; .backend_stack_size 0 and .vgpr_count 0x1 for this entry.
define amdgpu_gfx float @no_stack(float %arg0) #0 {
%add = fadd float %arg0, 1.0
ret float %add
}
; Single fixed-size alloca; the volatile store/load keeps the stack slot from
; being optimized away. Metadata above expects .backend_stack_size 0x14.
define amdgpu_gfx float @simple_stack(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
ret float %add
}
; Two separate allocas; metadata above expects .backend_stack_size 0x24.
define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
%stack2 = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack2
%val2 = load volatile float, ptr addrspace(5) %stack2
%add2 = fadd float %add, %val2
ret float %add2
}
; Alloca on a conditionally-executed path (only %bb1 allocates); metadata
; above expects .backend_stack_size 0x10 for this function.
define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
bb0:
%cmp = fcmp ogt float %arg0, 0.0
br i1 %cmp, label %bb1, label %bb2
bb1:
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
br label %bb2
bb2:
%res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
ret float %res
}
; Alloca inside a loop body; metadata above expects .backend_stack_size 0x10
; and .vgpr_count 0x3 for this function.
define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
bb0:
br label %bb1
bb1:
%ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
%cmp = icmp sgt i32 %ctr, 0
%newctr = sub i32 %ctr, 1
br i1 %cmp, label %bb1, label %bb2
bb2:
ret float %add
}
; No local stack, but calls @simple_stack; metadata above expects
; .backend_stack_size 0x10 (call overhead only).
define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
%res = call amdgpu_gfx float @simple_stack(float %arg0)
ret float %res
}
; Own alloca plus a call to @simple_stack; metadata above expects
; .backend_stack_size 0x20.
define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%res = call amdgpu_gfx float @simple_stack(float %arg0)
%add = fadd float %res, %val
ret float %add
}
; Call to an external function: register usage is unknown, so metadata above
; expects the conservative .vgpr_count 0x58 and .sgpr_count 0x24.
define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
%res = call amdgpu_gfx float @extern_func(float %arg0)
ret float %res
}
; Alloca plus an external call; metadata above expects .backend_stack_size
; 0x20 and the conservative .vgpr_count 0x58.
define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%res = call amdgpu_gfx float @extern_func(float %arg0)
%add = fadd float %res, %val
ret float %add
}
; External call taking a <64 x float> argument — the argument spill area grows
; the stack; metadata above expects .backend_stack_size 0x90.
define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
%res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
ret float %res
}
; Indirect call through @funcptr: callee unknown, so metadata above expects
; the conservative .vgpr_count 0x58, like the extern-call cases.
define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
%fptr = load ptr, ptr addrspace(4) @funcptr
call amdgpu_gfx void %fptr()
ret float %arg0
}
; Alloca plus an indirect call; metadata above expects .backend_stack_size
; 0x20 and the conservative .vgpr_count 0x58.
define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%fptr = load ptr, ptr addrspace(4) @funcptr
call amdgpu_gfx void %fptr()
%add = fadd float %arg0, %val
ret float %add
}
; Self-recursive with an alloca; metadata above expects .backend_stack_size
; 0x20 and .vgpr_count 0x2a.
define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
%add = fadd float %res, %val
ret float %add
}
@lds = internal addrspace(3) global [64 x float] poison
; LDS use only, no stack; metadata above expects .lds_size 0x100 and
; .backend_stack_size 0 for this function.
define amdgpu_gfx float @simple_lds(float %arg0) #0 {
%val = load float, ptr addrspace(3) @lds
ret float %val
}
; Recursive function combining LDS use with a call; metadata above expects
; .lds_size 0x100 and .backend_stack_size 0x10.
define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
%val = load float, ptr addrspace(3) @lds
%res = call amdgpu_gfx float @simple_lds_recurse(float %val)
ret float %res
}
attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
!amdgpu.pal.metadata.msgpack = !{!0}
!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
!1 = !{i32 7}

View File

@ -0,0 +1,205 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s
; CHECK-LABEL: {{^}}_amdgpu_cs_main:
; CHECK: ; TotalNumSgprs: 34
; CHECK: ; NumVgprs: 2
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
; CHECK-NEXT: amdpal.pipelines:
; CHECK-NEXT: - .api: Vulkan
; CHECK-NEXT: .compute_registers:
; CHECK-NEXT: .dynamic_vgpr_en: true
; CHECK-NEXT: .tg_size_en: true
; CHECK-NEXT: .tgid_x_en: false
; CHECK-NEXT: .tgid_y_en: false
; CHECK-NEXT: .tgid_z_en: false
; CHECK-NEXT: .tidig_comp_cnt: 0x1
; CHECK-NEXT: .graphics_registers:
; CHECK-NEXT: .ps_extra_lds_size: 0
; CHECK-NEXT: .spi_ps_input_addr:
; CHECK-NEXT: .ancillary_ena: false
; CHECK-NEXT: .front_face_ena: true
; CHECK-NEXT: .line_stipple_tex_ena: false
; CHECK-NEXT: .linear_center_ena: true
; CHECK-NEXT: .linear_centroid_ena: true
; CHECK-NEXT: .linear_sample_ena: true
; CHECK-NEXT: .persp_center_ena: true
; CHECK-NEXT: .persp_centroid_ena: true
; CHECK-NEXT: .persp_pull_model_ena: false
; CHECK-NEXT: .persp_sample_ena: true
; CHECK-NEXT: .pos_fixed_pt_ena: true
; CHECK-NEXT: .pos_w_float_ena: false
; CHECK-NEXT: .pos_x_float_ena: false
; CHECK-NEXT: .pos_y_float_ena: false
; CHECK-NEXT: .pos_z_float_ena: false
; CHECK-NEXT: .sample_coverage_ena: false
; CHECK-NEXT: .spi_ps_input_ena:
; CHECK-NEXT: .ancillary_ena: false
; CHECK-NEXT: .front_face_ena: false
; CHECK-NEXT: .line_stipple_tex_ena: false
; CHECK-NEXT: .linear_center_ena: false
; CHECK-NEXT: .linear_centroid_ena: false
; CHECK-NEXT: .linear_sample_ena: false
; CHECK-NEXT: .persp_center_ena: false
; CHECK-NEXT: .persp_centroid_ena: false
; CHECK-NEXT: .persp_pull_model_ena: false
; CHECK-NEXT: .persp_sample_ena: true
; CHECK-NEXT: .pos_fixed_pt_ena: false
; CHECK-NEXT: .pos_w_float_ena: false
; CHECK-NEXT: .pos_x_float_ena: false
; CHECK-NEXT: .pos_y_float_ena: false
; CHECK-NEXT: .pos_z_float_ena: false
; CHECK-NEXT: .sample_coverage_ena: false
; CHECK-NEXT: .hardware_stages:
; CHECK-NEXT: .cs:
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .dynamic_vgpr_saved_count: 0x70
; CHECK-NEXT: .entry_point: _amdgpu_cs
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .sgpr_limit: 0x6a
; CHECK-NEXT: .threadgroup_dimensions:
; CHECK-NEXT: - 0x1
; CHECK-NEXT: - 0x400
; CHECK-NEXT: - 0x1
; CHECK-NEXT: .trap_present: false
; CHECK-NEXT: .user_data_reg_map:
; CHECK-NEXT: - 0x10000000
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: .user_sgprs: 0x3
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: .vgpr_limit: 0x100
; CHECK-NEXT: .wavefront_size: 0x40
; CHECK-NEXT: .wgp_mode: false
; CHECK-NEXT: .gs:
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_gs
; CHECK-NEXT: .entry_point_symbol: gs_shader
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: true
; CHECK-NEXT: .hs:
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_hs
; CHECK-NEXT: .entry_point_symbol: hs_shader
; CHECK-NEXT: .lds_size: 0x1000
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: true
; CHECK-NEXT: .ps:
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_ps
; CHECK-NEXT: .entry_point_symbol: ps_shader
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: true
; CHECK: .registers: {}
; CHECK:amdpal.version:
; CHECK-NEXT: - 0x3
; CHECK-NEXT: - 0
; CHECK-NEXT:...
; CHECK-NEXT: .end_amdgpu_pal_metadata
; Compute entry point: builds a descriptor pointer from the inreg user SGPR,
; derives a buffer offset from the packed thread IDs in %arg2, and does one
; buffer store. CHECK lines above expect .vgpr_count 0x2, .sgpr_count 0x22 and
; .dynamic_vgpr_saved_count 0x70 for the .cs stage.
define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
.entry:
%i = call i64 @llvm.amdgcn.s.getpc()
; Combine the high half of PC with the 32-bit descriptor address in %arg1.
%i1 = and i64 %i, -4294967296
%i2 = zext i32 %arg1 to i64
%i3 = or i64 %i1, %i2
%i4 = inttoptr i64 %i3 to ptr addrspace(4)
; %arg2 packs two 10-bit thread-id fields; sum them to form the lane index.
%i5 = and i32 %arg2, 1023
%i6 = lshr i32 %arg2, 10
%i7 = and i32 %i6, 1023
%i8 = add nuw nsw i32 %i7, %i5
%i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
%.idx = shl nuw nsw i32 %i8, 2
call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
ret void
}
; Trivial pixel shader; CHECK lines above expect .lds_size 0 and
; .vgpr_count 0x1 for the .ps stage.
define dllexport amdgpu_ps void @ps_shader() #1 {
ret void
}
@LDS.GS = external addrspace(3) global [1 x i32], align 4
; Geometry shader touching @LDS.GS; CHECK lines above expect .lds_size 0x200
; for the .gs stage.
define dllexport amdgpu_gs void @gs_shader() #2 {
%ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0
store i32 0, ptr addrspace(3) %ptr, align 4
ret void
}
@LDS.HS = external addrspace(3) global [1024 x i32], align 4
; Hull shader touching @LDS.HS (1024 x i32); CHECK lines above expect
; .lds_size 0x1000 for the .hs stage.
define dllexport amdgpu_hs void @hs_shader() #2 {
%ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0
store i32 0, ptr addrspace(3) %ptr, align 4
ret void
}
!amdgpu.pal.metadata.msgpack = !{!0}
; Function Attrs: nounwind willreturn memory(none)
declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.amdgcn.s.getpc() #2
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3
attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode" "amdgpu-dynamic-vgpr-block-size"="16" }
attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }
!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
!1 = !{i32 7}

View File

@ -0,0 +1,340 @@
# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
# Functions carrying the "amdgpu-dynamic-vgpr-block-size" attribute must
# release their VGPRs before the shader ends: si-insert-waitcnts is expected
# to emit S_ALLOC_VGPR 0 before S_ENDPGM instead of the DEALLOC_VGPRS message
# (S_SENDMSG 3), for every kind of memory operation below and for several CFG
# shapes. The optnone function at the end is expected to get neither.
--- |
define amdgpu_ps void @tbuffer_store1() #0 { ret void }
define amdgpu_ps void @tbuffer_store2() #0 { ret void }
define amdgpu_ps void @flat_store() #0 { ret void }
define amdgpu_ps void @global_store() #0 { ret void }
define amdgpu_ps void @buffer_store_format() #0 { ret void }
define amdgpu_ps void @ds_write_b32() #0 { ret void }
define amdgpu_ps void @global_store_dword() #0 { ret void }
define amdgpu_ps void @multiple_basic_blocks1() #0 { ret void }
define amdgpu_ps void @multiple_basic_blocks2() #0 { ret void }
define amdgpu_ps void @multiple_basic_blocks3() #0 { ret void }
define amdgpu_ps void @recursive_loop() #0 { ret void }
define amdgpu_ps void @recursive_loop_vmem() #0 { ret void }
define amdgpu_ps void @image_store() #0 { ret void }
define amdgpu_ps void @scratch_store() #0 { ret void }
define amdgpu_ps void @buffer_atomic() #0 { ret void }
define amdgpu_ps void @flat_atomic() #0 { ret void }
define amdgpu_ps void @global_atomic() #0 { ret void }
define amdgpu_ps void @image_atomic() #0 { ret void }
define amdgpu_ps void @global_store_optnone() #1 { ret void }
; #0: dynamic VGPR mode with a block size of 16.
; #1: same block size, but optnone — the checks expect no dealloc code at all.
attributes #0 = { "amdgpu-dynamic-vgpr-block-size" = "16" }
attributes #1 = { "amdgpu-dynamic-vgpr-block-size" = "16" noinline optnone }
...
---
# TBUFFER store (OFFSET form): expect S_ALLOC_VGPR 0, never S_SENDMSG 3.
name: tbuffer_store1
body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store1
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
S_ENDPGM 0
...
---
# TBUFFER store (OFFEN form, with a memory operand): same expectation.
name: tbuffer_store2
body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store2
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
S_ENDPGM 0
...
---
# FLAT store: expect S_ALLOC_VGPR 0, never S_SENDMSG 3.
name: flat_store
body: |
bb.0:
; CHECK-LABEL: name: flat_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
# GLOBAL store followed by an explicit store-counter wait: dealloc still emitted.
name: global_store
body: |
bb.0:
; CHECK-LABEL: name: global_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
S_WAIT_STORECNT 0
S_ENDPGM 0
...
---
# BUFFER format store: expect S_ALLOC_VGPR 0, never S_SENDMSG 3.
name: buffer_store_format
body: |
bb.0:
; CHECK-LABEL: name: buffer_store_format
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
S_ENDPGM 0
...
---
# LDS (DS) write: dealloc is emitted for local-memory stores too.
name: ds_write_b32
body: |
bb.0:
; CHECK-LABEL: name: ds_write_b32
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0 = IMPLICIT_DEF
renamable $vgpr1 = IMPLICIT_DEF
DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec, implicit $m0
S_ENDPGM 0
...
---
# GLOBAL store (SADDR form) preceded by VALU work: same expectation.
name: global_store_dword
body: |
bb.0:
liveins: $vgpr0, $sgpr0_sgpr1
; CHECK-LABEL: name: global_store_dword
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
S_ENDPGM 0
...
---
# Diamond-free CFG with a loop back-edge: the dealloc must land in the block
# that ends the program (bb.2).
name: multiple_basic_blocks1
body: |
; CHECK-LABEL: name: multiple_basic_blocks1
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Two predecessors both storing: dealloc only in the exit block (after bb.2).
name: multiple_basic_blocks2
body: |
; CHECK-LABEL: name: multiple_basic_blocks2
; CHECK: bb.2:
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.2
bb.1:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Deeper CFG: dealloc only in the final block (after bb.4).
name: multiple_basic_blocks3
body: |
; CHECK-LABEL: name: multiple_basic_blocks3
; CHECK: bb.4:
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
S_BRANCH %bb.2
bb.1:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.2
bb.2:
successors: %bb.4
S_BRANCH %bb.4
bb.3:
successors: %bb.4
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.4
bb.4:
S_ENDPGM 0
...
---
# Self-loop with a VMEM load before it: dealloc still emitted at the exit.
name: recursive_loop
body: |
; CHECK-LABEL: name: recursive_loop
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Self-loop containing a VMEM store: dealloc still emitted at the exit.
name: recursive_loop_vmem
body: |
; CHECK-LABEL: name: recursive_loop_vmem
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Image store: expect S_ALLOC_VGPR 0, never S_SENDMSG 3.
name: image_store
body: |
bb.0:
; CHECK-LABEL: name: image_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
S_ENDPGM 0
...
---
# Scratch store: expect S_ALLOC_VGPR 0, never S_SENDMSG 3.
name: scratch_store
body: |
bb.0:
; CHECK-LABEL: name: scratch_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
# Buffer atomic (no returned value used): dealloc still required.
name: buffer_atomic
body: |
bb.0:
; CHECK-LABEL: name: buffer_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
S_ENDPGM 0
...
---
# Flat atomic with returned value: dealloc still required.
name: flat_atomic
body: |
bb.0:
; CHECK-LABEL: name: flat_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
# Global atomic with returned value: dealloc still required.
name: global_atomic
body: |
bb.0:
; CHECK-LABEL: name: global_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
S_ENDPGM 0
...
---
# Image atomic (cmpswap): dealloc still required.
name: image_atomic
body: |
bb.0:
; CHECK-LABEL: name: image_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
S_ENDPGM 0
...
---
# optnone function (attribute group #1): neither the DEALLOC_VGPRS message nor
# S_ALLOC_VGPR may be inserted.
name: global_store_optnone
body: |
bb.0:
; CHECK-LABEL: name: global_store_optnone
; CHECK-NOT: S_SENDMSG 3
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
S_WAIT_STORECNT 0
S_ENDPGM 0
...

View File

@ -1,5 +1,4 @@
# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
# RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
--- |
define amdgpu_ps void @tbuffer_store1() { ret void }
@ -29,8 +28,7 @@ body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store1
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
S_ENDPGM 0
@ -42,8 +40,7 @@ body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store2
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
S_ENDPGM 0
@ -55,8 +52,7 @@ body: |
bb.0:
; CHECK-LABEL: name: flat_store
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
@ -68,8 +64,7 @@ body: |
bb.0:
; CHECK-LABEL: name: global_store
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
S_WAIT_STORECNT 0
@ -82,8 +77,7 @@ body: |
bb.0:
; CHECK-LABEL: name: buffer_store_format
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
S_ENDPGM 0
@ -95,8 +89,7 @@ body: |
bb.0:
; CHECK-LABEL: name: ds_write_b32
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0 = IMPLICIT_DEF
renamable $vgpr1 = IMPLICIT_DEF
@ -112,8 +105,7 @@ body: |
; CHECK-LABEL: name: global_store_dword
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
@ -125,8 +117,7 @@ name: multiple_basic_blocks1
body: |
; CHECK-LABEL: name: multiple_basic_blocks1
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
@ -153,8 +144,7 @@ body: |
; CHECK-LABEL: name: multiple_basic_blocks2
; CHECK: bb.2:
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
@ -180,8 +170,7 @@ body: |
; CHECK-LABEL: name: multiple_basic_blocks3
; CHECK: bb.4:
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
@ -216,8 +205,7 @@ name: recursive_loop
body: |
; CHECK-LABEL: name: recursive_loop
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
@ -241,8 +229,7 @@ name: recursive_loop_vmem
body: |
; CHECK-LABEL: name: recursive_loop_vmem
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
@ -268,8 +255,7 @@ body: |
bb.0:
; CHECK-LABEL: name: image_store
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
S_ENDPGM 0
@ -281,8 +267,7 @@ body: |
bb.0:
; CHECK-LABEL: name: scratch_store
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
@ -295,8 +280,7 @@ body: |
bb.0:
; CHECK-LABEL: name: buffer_atomic
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
S_ENDPGM 0
@ -308,8 +292,7 @@ body: |
bb.0:
; CHECK-LABEL: name: flat_atomic
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
S_ENDPGM 0
@ -322,8 +305,7 @@ body: |
bb.0:
; CHECK-LABEL: name: global_atomic
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
S_ENDPGM 0
@ -335,8 +317,7 @@ body: |
bb.0:
; CHECK-LABEL: name: image_atomic
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
S_ENDPGM 0

View File

@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
@ -312,6 +313,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {

View File

@ -44,6 +44,7 @@
; AFTER-PEI-NEXT: sgprForEXECCopy: ''
; AFTER-PEI-NEXT: longBranchReservedReg: ''
; AFTER-PEI-NEXT: hasInitWholeWave: false
; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0
; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
; AFTER-PEI-NEXT: body:
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {

View File

@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {

View File

@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {

View File

@ -53,6 +53,7 @@
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:
@ -159,6 +160,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:
@ -236,6 +238,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:
@ -314,6 +317,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:

View File

@ -54,6 +54,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
@ -102,6 +103,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
@ -174,6 +176,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define void @function() {
@ -228,6 +231,7 @@ define void @function() {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define void @function_nsz() #0 {

View File

@ -93,16 +93,16 @@ static const std::pair<StringRef, StringRef>
W32FS = {"+wavefrontsize32", "w32"},
W64FS = {"+wavefrontsize64", "w64"};
using TestFuncTy =
function_ref<bool(std::stringstream &, unsigned, const GCNSubtarget &)>;
using TestFuncTy = function_ref<bool(std::stringstream &, unsigned,
const GCNSubtarget &, bool)>;
static bool testAndRecord(std::stringstream &Table, const GCNSubtarget &ST,
TestFuncTy test) {
TestFuncTy test, unsigned DynamicVGPRBlockSize) {
bool Success = true;
unsigned MaxOcc = ST.getMaxWavesPerEU();
for (unsigned Occ = MaxOcc; Occ > 0; --Occ) {
Table << std::right << std::setw(3) << Occ << " ";
Success = test(Table, Occ, ST) && Success;
Success = test(Table, Occ, ST, DynamicVGPRBlockSize) && Success;
Table << '\n';
}
return Success;
@ -132,7 +132,7 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
FS = &W32FS;
std::stringstream Table;
bool Success = testAndRecord(Table, ST, test);
bool Success = testAndRecord(Table, ST, test, /*DynamicVGPRBlockSize=*/0);
if (!Success || PrintCpuRegLimits)
TablePerCPUs[Table.str()].push_back((CanonCPUName + FS->second).str());
@ -155,40 +155,50 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
TestFuncTy test) {
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
"+dynamic-vgpr," + FS.str());
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
ASSERT_TRUE(TM) << "No target machine";
GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
std::string(TM->getTargetFeatureString()), *TM);
ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
std::stringstream Table;
bool Success = testAndRecord(Table, ST, test);
EXPECT_TRUE(Success && !PrintCpuRegLimits)
<< CPUName << " dynamic VGPR " << FS
<< ":\nOcc MinVGPR MaxVGPR\n"
<< Table.str() << '\n';
auto testWithBlockSize = [&](unsigned DynamicVGPRBlockSize) {
std::stringstream Table;
bool Success = testAndRecord(Table, ST, test, DynamicVGPRBlockSize);
EXPECT_TRUE(Success && !PrintCpuRegLimits)
<< CPUName << " dynamic VGPR block size " << DynamicVGPRBlockSize
<< ":\nOcc MinVGPR MaxVGPR\n"
<< Table.str() << '\n';
};
testWithBlockSize(16);
testWithBlockSize(32);
}
TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST,
unsigned DynamicVGPRBlockSize) {
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs(DynamicVGPRBlockSize);
return checkMinMax(
OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum), ST.getMaxWavesPerEU(),
[&](unsigned NumGPRs) { return ST.getOccupancyWithNumVGPRs(NumGPRs); },
[&](unsigned Occ) { return ST.getMinNumVGPRs(Occ); },
[&](unsigned Occ) { return ST.getMaxNumVGPRs(Occ); });
OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum, DynamicVGPRBlockSize),
ST.getMaxWavesPerEU(),
[&](unsigned NumGPRs) {
return ST.getOccupancyWithNumVGPRs(NumGPRs, DynamicVGPRBlockSize);
},
[&](unsigned Occ) {
return ST.getMinNumVGPRs(Occ, DynamicVGPRBlockSize);
},
[&](unsigned Occ) {
return ST.getMaxNumVGPRs(Occ, DynamicVGPRBlockSize);
});
};
testGPRLimits("VGPR", true, test);
testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
testDynamicVGPRLimits("gfx1200",
"+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
}
static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
unsigned DynamicVGPRBlockSize,
unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
unsigned ExpectedMaxVGPRs) {
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
@ -206,11 +216,15 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
std::string DVGPRBlockSize = std::to_string(DynamicVGPRBlockSize);
if (DynamicVGPRBlockSize)
Func->addFnAttr("amdgpu-dynamic-vgpr-block-size", DVGPRBlockSize);
auto Range = ST.getWavesPerEU(*Func);
EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs(DynamicVGPRBlockSize))
<< CPUName << ' ' << FS;
// Function with requested 'amdgpu-waves-per-eu' in a valid range.
@ -221,11 +235,10 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
}
TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
testAbsoluteLimits(
"gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
1, 16, 256);
// CPUName, Features, DynamicVGPRBlockSize; Expected MinOcc, MaxOcc, MaxVGPRs
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 0, 1, 16, 256);
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 16, 1, 16, 128);
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 32, 1, 16, 256);
}
static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {