[AMDGPU] Replace dynamic VGPR feature with attribute (#133444)

Use a function attribute (amdgpu-dynamic-vgpr-block-size) instead of a
subtarget feature, as requested in #130030.
This commit is contained in:
Diana Picus 2025-06-24 11:09:36 +02:00 committed by GitHub
parent 6cfa03f1f1
commit a201f8872a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
34 changed files with 1205 additions and 199 deletions

View File

@ -768,11 +768,6 @@ For example:
performant than code generated for XNACK replay performant than code generated for XNACK replay
disabled. disabled.
dynamic-vgpr TODO Represents the "Dynamic VGPR" hardware mode, introduced in GFX12.
Waves launched in this mode may allocate or deallocate the VGPRs
using dedicated instructions, but may not send the DEALLOC_VGPRS
message.
=============== ============================ ================================================== =============== ============================ ==================================================
.. _amdgpu-target-id: .. _amdgpu-target-id:
@ -1764,6 +1759,15 @@ The AMDGPU backend supports the following LLVM IR attributes.
"amdgpu-promote-alloca-to-vector-vgpr-ratio" Ratio of VGPRs to budget for promoting alloca to vectors. "amdgpu-promote-alloca-to-vector-vgpr-ratio" Ratio of VGPRs to budget for promoting alloca to vectors.
"amdgpu-dynamic-vgpr-block-size" Represents the size of a VGPR block in the "Dynamic VGPR" hardware mode,
introduced in GFX12.
A value of 0 (default) means that dynamic VGPRs are not enabled.
Valid values for GFX12+ are 16 and 32.
Waves launched in this mode may allocate or deallocate the VGPRs
using dedicated instructions, but may not send the DEALLOC_VGPRS
message. If a shader has this attribute, then all its callees must
match its value.
================================================ ========================================================== ================================================ ==========================================================
Calling Conventions Calling Conventions

View File

@ -1281,12 +1281,14 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32" "v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
>; >;
// FIXME: Remove after all users are migrated to attribute.
def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr", def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
"DynamicVGPR", "DynamicVGPR",
"true", "true",
"Enable dynamic VGPR mode" "Enable dynamic VGPR mode"
>; >;
// FIXME: Remove after all users are migrated to attribute.
def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32", def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
"DynamicVGPRBlockSize32", "DynamicVGPRBlockSize32",
"true", "true",

View File

@ -452,15 +452,17 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
unsigned MaxWaves = MFI.getMaxWavesPerEU(); unsigned MaxWaves = MFI.getMaxWavesPerEU();
uint64_t TotalNumVgpr = uint64_t TotalNumVgpr =
getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr); getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
uint64_t NumVGPRsForWavesPerEU = std::max( uint64_t NumVGPRsForWavesPerEU =
{TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)}); std::max({TotalNumVgpr, (uint64_t)1,
(uint64_t)STM.getMinNumVGPRs(
MaxWaves, MFI.getDynamicVGPRBlockSize())});
uint64_t NumSGPRsForWavesPerEU = std::max( uint64_t NumSGPRsForWavesPerEU = std::max(
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)}); {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy( const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
STM.getOccupancyWithWorkGroupSizes(*MF).second, STM.getOccupancyWithWorkGroupSizes(*MF).second,
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext), MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM, MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
OutContext); MFI.getDynamicVGPRBlockSize(), STM, OutContext);
uint64_t Occupancy; uint64_t Occupancy;
const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute( const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
@ -1082,7 +1084,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
Ctx); Ctx);
ProgInfo.NumVGPRsForWavesPerEU = ProgInfo.NumVGPRsForWavesPerEU =
AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul), AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
CreateExpr(STM.getMinNumVGPRs(MaxWaves))}, CreateExpr(STM.getMinNumVGPRs(
MaxWaves, MFI->getDynamicVGPRBlockSize()))},
Ctx); Ctx);
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
@ -1256,7 +1259,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize).second, STM.computeOccupancy(F, ProgInfo.LDSSize).second,
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
MFI->getDynamicVGPRBlockSize(), STM, Ctx);
const auto [MinWEU, MaxWEU] = const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true); AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
@ -1405,7 +1409,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
// Helper function to add common PAL Metadata 3.0+ // Helper function to add common PAL Metadata 3.0+
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
const SIProgramInfo &CurrentProgramInfo, const SIProgramInfo &CurrentProgramInfo,
CallingConv::ID CC, const GCNSubtarget &ST) { CallingConv::ID CC, const GCNSubtarget &ST,
unsigned DynamicVGPRBlockSize) {
if (ST.hasIEEEMode()) if (ST.hasIEEEMode())
MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode); MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
@ -1417,7 +1422,7 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
(bool)CurrentProgramInfo.TrapHandlerEnable); (bool)CurrentProgramInfo.TrapHandlerEnable);
MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable); MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
if (ST.isDynamicVGPREnabled()) if (DynamicVGPRBlockSize != 0)
MD->setComputeRegisters(".dynamic_vgpr_en", true); MD->setComputeRegisters(".dynamic_vgpr_en", true);
} }
@ -1444,7 +1449,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
// For targets that support dynamic VGPRs, set the number of saved dynamic // For targets that support dynamic VGPRs, set the number of saved dynamic
// VGPRs (if any) in the PAL metadata. // VGPRs (if any) in the PAL metadata.
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.isDynamicVGPREnabled() && if (MFI->isDynamicVGPREnabled() &&
MFI->getScratchReservedForDynamicVGPRs() > 0) MFI->getScratchReservedForDynamicVGPRs() > 0)
MD->setHwStage(CC, ".dynamic_vgpr_saved_count", MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
MFI->getScratchReservedForDynamicVGPRs() / 4); MFI->getScratchReservedForDynamicVGPRs() / 4);
@ -1470,7 +1475,8 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode); MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean, MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
CurrentProgramInfo.ScratchEnable); CurrentProgramInfo.ScratchEnable);
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM); EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
MFI->getDynamicVGPRBlockSize());
} }
// ScratchSize is in bytes, 16 aligned. // ScratchSize is in bytes, 16 aligned.
@ -1541,7 +1547,9 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
MD->setRsrc2(CallingConv::AMDGPU_CS, MD->setRsrc2(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx); CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
} else { } else {
EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST); EmitPALMetadataCommon(
MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
} }
// Set optional info // Set optional info

View File

@ -173,8 +173,16 @@ static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
return 128; return 128;
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
// Temporarily check both the attribute and the subtarget feature, until the
// latter is removed.
if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
unsigned MaxVGPRs = ST.getMaxNumVGPRs( unsigned MaxVGPRs = ST.getMaxNumVGPRs(
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first); ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
DynamicVGPRBlockSize);
// A non-entry function has only 32 caller preserved registers. // A non-entry function has only 32 caller preserved registers.
// Do not promote alloca which will force spilling unless we know the function // Do not promote alloca which will force spilling unless we know the function

View File

@ -448,7 +448,10 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// TODO: assert Regions are sorted descending by pressure // TODO: assert Regions are sorted descending by pressure
const auto &ST = MF.getSubtarget<GCNSubtarget>(); const auto &ST = MF.getSubtarget<GCNSubtarget>();
const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); const unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
const auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
<< ", current = " << Occ << '\n'); << ", current = " << Occ << '\n');
@ -457,7 +460,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// Always build the DAG to add mutations // Always build the DAG to add mutations
BuildDAG DAG(*R, *this); BuildDAG DAG(*R, *this);
if (R->MaxPressure.getOccupancy(ST) >= NewOcc) if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc)
continue; continue;
LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
@ -468,7 +471,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n"; LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
printSchedRP(dbgs(), R->MaxPressure, MaxRP)); printSchedRP(dbgs(), R->MaxPressure, MaxRP));
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST)); NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize));
if (NewOcc <= Occ) if (NewOcc <= Occ)
break; break;
@ -489,9 +492,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto &ST = MF.getSubtarget<GCNSubtarget>(); const auto &ST = MF.getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy(); auto TgtOcc = MFI->getMinAllowedOccupancy();
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
sortRegionsByPressure(TgtOcc); sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
bool IsReentry = false; bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) { if (TryMaximizeOccupancy && Occ < TgtOcc) {
@ -522,19 +527,21 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto RP = getRegionPressure(*R); const auto RP = getRegionPressure(*R);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) { if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() && if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) { ST, DynamicVGPRBlockSize) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R); scheduleBest(*R);
} else { } else {
LLVM_DEBUG(dbgs() << ", restoring\n"); LLVM_DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder(); Ovr.restoreOrder();
assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc); assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >=
TgtOcc);
} }
} }
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST)); FinalOccupancy =
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
} }
} }
MFI->limitOccupancy(FinalOccupancy); MFI->limitOccupancy(FinalOccupancy);
@ -580,9 +587,11 @@ void GCNIterativeScheduler::scheduleILP(
const auto &ST = MF.getSubtarget<GCNSubtarget>(); const auto &ST = MF.getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy(); auto TgtOcc = MFI->getMinAllowedOccupancy();
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
sortRegionsByPressure(TgtOcc); sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
bool IsReentry = false; bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) { if (TryMaximizeOccupancy && Occ < TgtOcc) {
@ -603,17 +612,18 @@ void GCNIterativeScheduler::scheduleILP(
const auto RP = getSchedulePressure(*R, ILPSchedule); const auto RP = getSchedulePressure(*R, ILPSchedule);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) { if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() && if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) { ST, DynamicVGPRBlockSize) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R); scheduleBest(*R);
} }
} else { } else {
scheduleRegion(*R, ILPSchedule, RP); scheduleRegion(*R, ILPSchedule, RP);
LLVM_DEBUG(printSchedResult(dbgs(), R, RP)); LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST)); FinalOccupancy =
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
} }
} }
MFI->limitOccupancy(FinalOccupancy); MFI->limitOccupancy(FinalOccupancy);

View File

@ -251,7 +251,9 @@ bool GCNNSAReassignImpl::run(MachineFunction &MF) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MaxNumVGPRs = ST->getMaxNumVGPRs(MF); MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs); MaxNumVGPRs = std::min(
ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->getDynamicVGPRBlockSize()),
MaxNumVGPRs);
CSRegs = MRI->getCalleeSavedRegs(); CSRegs = MRI->getCalleeSavedRegs();
using Candidate = std::pair<const MachineInstr*, bool>; using Candidate = std::pair<const MachineInstr*, bool>;

View File

@ -13,6 +13,7 @@
#include "GCNRegPressure.h" #include "GCNRegPressure.h"
#include "AMDGPU.h" #include "AMDGPU.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm; using namespace llvm;
@ -94,17 +95,20 @@ void GCNRegPressure::inc(unsigned Reg,
bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O, bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy) const { unsigned MaxOccupancy) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
const auto SGPROcc = std::min(MaxOccupancy, const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGPRNum())); ST.getOccupancyWithNumSGPRs(getSGPRNum()));
const auto VGPROcc = const auto VGPROcc = std::min(
std::min(MaxOccupancy, MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()))); DynamicVGPRBlockSize));
const auto OtherSGPROcc = std::min(MaxOccupancy, const auto OtherSGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(O.getSGPRNum())); ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
const auto OtherVGPROcc = const auto OtherVGPROcc =
std::min(MaxOccupancy, std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()))); ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
const auto Occ = std::min(SGPROcc, VGPROcc); const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@ -226,13 +230,15 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
O.getVGPRNum(ST.hasGFX90AInsts())); O.getVGPRNum(ST.hasGFX90AInsts()));
} }
Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) { Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
return Printable([&RP, ST](raw_ostream &OS) { unsigned DynamicVGPRBlockSize) {
return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) {
OS << "VGPRs: " << RP.getArchVGPRNum() << ' ' OS << "VGPRs: " << RP.getArchVGPRNum() << ' '
<< "AGPRs: " << RP.getAGPRNum(); << "AGPRs: " << RP.getAGPRNum();
if (ST) if (ST)
OS << "(O" OS << "(O"
<< ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts())) << ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()),
DynamicVGPRBlockSize)
<< ')'; << ')';
OS << ", SGPRs: " << RP.getSGPRNum(); OS << ", SGPRs: " << RP.getSGPRNum();
if (ST) if (ST)
@ -240,7 +246,7 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight() OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight()
<< ", LSGPR WT: " << RP.getSGPRTuplesWeight(); << ", LSGPR WT: " << RP.getSGPRTuplesWeight();
if (ST) if (ST)
OS << " -> Occ: " << RP.getOccupancy(*ST); OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize);
OS << '\n'; OS << '\n';
}); });
} }

View File

@ -69,9 +69,11 @@ struct GCNRegPressure {
} }
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; } unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
unsigned getOccupancy(const GCNSubtarget &ST) const { unsigned getOccupancy(const GCNSubtarget &ST,
unsigned DynamicVGPRBlockSize) const {
return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()))); ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
} }
void inc(unsigned Reg, void inc(unsigned Reg,
@ -79,8 +81,10 @@ struct GCNRegPressure {
LaneBitmask NewMask, LaneBitmask NewMask,
const MachineRegisterInfo &MRI); const MachineRegisterInfo &MRI);
bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const { bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O,
return getOccupancy(ST) > O.getOccupancy(ST); unsigned DynamicVGPRBlockSize) const {
return getOccupancy(ST, DynamicVGPRBlockSize) >
O.getOccupancy(ST, DynamicVGPRBlockSize);
} }
/// Compares \p this GCNRegpressure to \p O, returning true if \p this is /// Compares \p this GCNRegpressure to \p O, returning true if \p this is
@ -133,7 +137,8 @@ private:
friend GCNRegPressure max(const GCNRegPressure &P1, friend GCNRegPressure max(const GCNRegPressure &P1,
const GCNRegPressure &P2); const GCNRegPressure &P2);
friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST); friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST,
unsigned DynamicVGPRBlockSize);
}; };
inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) { inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
@ -402,7 +407,8 @@ GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
bool isEqual(const GCNRPTracker::LiveRegSet &S1, bool isEqual(const GCNRPTracker::LiveRegSet &S1,
const GCNRPTracker::LiveRegSet &S2); const GCNRPTracker::LiveRegSet &S2);
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr); Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr,
unsigned DynamicVGPRBlockSize = 0);
Printable print(const GCNRPTracker::LiveRegSet &LiveRegs, Printable print(const GCNRPTracker::LiveRegSet &LiveRegs,
const MachineRegisterInfo &MRI); const MachineRegisterInfo &MRI);

View File

@ -99,17 +99,20 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit); std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
if (!KnownExcessRP) { if (!KnownExcessRP) {
VGPRCriticalLimit = VGPRCriticalLimit = std::min(
std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit); ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
VGPRExcessLimit);
} else { } else {
// This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
// returns a reasonably small number for targets with lots of VGPRs, such // returns a reasonably small number for targets with lots of VGPRs, such
// as GFX10 and GFX11. // as GFX10 and GFX11.
LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative " LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
"VGPRCriticalLimit calculation method.\n"); "VGPRCriticalLimit calculation method.\n");
unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST); unsigned Granule =
unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST); AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
unsigned Addressable =
AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule); unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
VGPRBudget = std::max(VGPRBudget, Granule); VGPRBudget = std::max(VGPRBudget, Granule);
VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit); VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
@ -1136,7 +1139,8 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
if (DAG.MinOccupancy > InitialOccupancy) { if (DAG.MinOccupancy > InitialOccupancy) {
for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX) for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
DAG.RegionsWithMinOcc[IDX] = DAG.RegionsWithMinOcc[IDX] =
DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy; DAG.Pressure[IDX].getOccupancy(
DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << StageID LLVM_DEBUG(dbgs() << StageID
<< " stage successfully increased occupancy to " << " stage successfully increased occupancy to "
@ -1273,11 +1277,14 @@ void GCNSchedStage::checkScheduling() {
LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
DAG.Pressure[RegionIdx] = PressureAfter; DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] = DAG.RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
DAG.MinOccupancy;
// Early out if we have achieved the occupancy target. // Early out if we have achieved the occupancy target.
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
@ -1286,10 +1293,10 @@ void GCNSchedStage::checkScheduling() {
unsigned TargetOccupancy = std::min( unsigned TargetOccupancy = std::min(
S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second); S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
unsigned WavesAfter = unsigned WavesAfter = std::min(
std::min(TargetOccupancy, PressureAfter.getOccupancy(ST)); TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
unsigned WavesBefore = unsigned WavesBefore = std::min(
std::min(TargetOccupancy, PressureBefore.getOccupancy(ST)); TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
<< ", after " << WavesAfter << ".\n"); << ", after " << WavesAfter << ".\n");
@ -1338,7 +1345,8 @@ void GCNSchedStage::checkScheduling() {
} else { } else {
DAG.Pressure[RegionIdx] = PressureAfter; DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] = DAG.RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy; PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
DAG.MinOccupancy;
} }
} }
@ -1461,11 +1469,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
return true; return true;
// For dynamic VGPR mode, we don't want to waste any VGPR blocks. // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
if (ST.isDynamicVGPREnabled()) { if (DAG.MFI.isDynamicVGPREnabled()) {
unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, PressureBefore.getVGPRNum(false)); &ST, DAG.MFI.getDynamicVGPRBlockSize(),
PressureBefore.getVGPRNum(false));
unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, PressureAfter.getVGPRNum(false)); &ST, DAG.MFI.getDynamicVGPRBlockSize(),
PressureAfter.getVGPRNum(false));
if (BlocksAfter > BlocksBefore) if (BlocksAfter > BlocksBefore)
return true; return true;
} }
@ -1489,7 +1499,8 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
// If RP is not reduced in the unclustered reschedule stage, revert to the // If RP is not reduced in the unclustered reschedule stage, revert to the
// old schedule. // old schedule.
if ((WavesAfter <= PressureBefore.getOccupancy(ST) && if ((WavesAfter <=
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
mayCauseSpilling(WavesAfter)) || mayCauseSpilling(WavesAfter)) ||
GCNSchedStage::shouldRevertScheduling(WavesAfter)) { GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
@ -1511,8 +1522,9 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
ScheduleMetrics MAfter = getScheduleMetrics(DAG); ScheduleMetrics MAfter = getScheduleMetrics(DAG);
unsigned OldMetric = MBefore.getMetric(); unsigned OldMetric = MBefore.getMetric();
unsigned NewMetric = MAfter.getMetric(); unsigned NewMetric = MAfter.getMetric();
unsigned WavesBefore = unsigned WavesBefore = std::min(
std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST)); S.getTargetOccupancy(),
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
unsigned Profit = unsigned Profit =
((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
@ -1566,7 +1578,8 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
void GCNSchedStage::revertScheduling() { void GCNSchedStage::revertScheduling() {
DAG.RegionsWithMinOcc[RegionIdx] = DAG.RegionsWithMinOcc[RegionIdx] =
PressureBefore.getOccupancy(ST) == DAG.MinOccupancy; PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) ==
DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
DAG.RegionEnd = DAG.RegionBegin; DAG.RegionEnd = DAG.RegionBegin;
int SkippedDebugInstr = 0; int SkippedDebugInstr = 0;
@ -1844,13 +1857,16 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
// occupancy, or regions with VGPR spilling) to a model of their excess RP. // occupancy, or regions with VGPR spilling) to a model of their excess RP.
DenseMap<unsigned, ExcessRP> OptRegions; DenseMap<unsigned, ExcessRP> OptRegions;
const Function &F = MF.getFunction(); const Function &F = MF.getFunction();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F); std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F); const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F); const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
const unsigned MaxSGPRsIncOcc = const unsigned MaxSGPRsIncOcc =
ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false); ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1); const unsigned MaxVGPRsIncOcc =
ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy; IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
auto ClearOptRegionsIf = [&](bool Cond) -> bool { auto ClearOptRegionsIf = [&](bool Cond) -> bool {
@ -2163,7 +2179,9 @@ void PreRARematStage::rematerialize() {
} }
} }
DAG.Pressure[I] = RP; DAG.Pressure[I] = RP;
AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(ST)); AchievedOcc = std::min(
AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
->getDynamicVGPRBlockSize()));
} }
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n"); REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
} }

View File

@ -366,8 +366,11 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
getGeneration()); getGeneration());
} }
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const { unsigned
return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs); GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
DynamicVGPRBlockSize);
} }
unsigned unsigned
@ -403,9 +406,15 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
std::pair<unsigned, unsigned> std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs, unsigned NumVGPRs) const { unsigned NumSGPRs, unsigned NumVGPRs) const {
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
// Temporarily check both the attribute and the subtarget feature until the
// latter is removed.
if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F); auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs); unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs); unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
// Maximum occupancy may be further limited by high SGPR/VGPR usage. // Maximum occupancy may be further limited by high SGPR/VGPR usage.
MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc)); MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
@ -512,9 +521,16 @@ unsigned GCNSubtarget::getBaseMaxNumVGPRs(
} }
unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const { unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
// Temporarily check both the attribute and the subtarget feature, until the
// latter is removed.
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
std::pair<unsigned, unsigned> Waves = getWavesPerEU(F); std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
return getBaseMaxNumVGPRs( return getBaseMaxNumVGPRs(
F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)}); F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
} }
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {

View File

@ -1391,7 +1391,8 @@ public:
/// Return the maximum number of waves per SIMD for kernels using \p VGPRs /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
/// VGPRs /// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
unsigned DynamicVGPRBlockSize) const;
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p F, each workgroup /// be achieved when the only function running on a CU is \p F, each workgroup
@ -1549,8 +1550,8 @@ public:
unsigned getMaxNumSGPRs(const Function &F) const; unsigned getMaxNumSGPRs(const Function &F) const;
/// \returns VGPR allocation granularity supported by the subtarget. /// \returns VGPR allocation granularity supported by the subtarget.
unsigned getVGPRAllocGranule() const { unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getVGPRAllocGranule(this); return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
} }
/// \returns VGPR encoding granularity supported by the subtarget. /// \returns VGPR encoding granularity supported by the subtarget.
@ -1570,20 +1571,24 @@ public:
} }
/// \returns Addressable number of VGPRs supported by the subtarget. /// \returns Addressable number of VGPRs supported by the subtarget.
unsigned getAddressableNumVGPRs() const { unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
} }
/// \returns the minimum number of VGPRs that will prevent achieving more than /// \returns the minimum number of VGPRs that will prevent achieving more than
/// the specified number of waves \p WavesPerEU. /// the specified number of waves \p WavesPerEU.
unsigned getMinNumVGPRs(unsigned WavesPerEU) const { unsigned getMinNumVGPRs(unsigned WavesPerEU,
return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
DynamicVGPRBlockSize);
} }
/// \returns the maximum number of VGPRs that can be used and still achieve /// \returns the maximum number of VGPRs that can be used and still achieve
/// at least the specified number of waves \p WavesPerEU. /// at least the specified number of waves \p WavesPerEU.
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { unsigned getMaxNumVGPRs(unsigned WavesPerEU,
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
DynamicVGPRBlockSize);
} }
/// \returns max num VGPRs. This is the common utility function /// \returns max num VGPRs. This is the common utility function
@ -1686,6 +1691,9 @@ public:
} }
bool isDynamicVGPREnabled() const { return DynamicVGPR; } bool isDynamicVGPREnabled() const { return DynamicVGPR; }
/// \returns the dynamic VGPR block size selected through the subtarget
/// features: 32 when DynamicVGPRBlockSize32 is set, 16 otherwise. This does
/// not look at DynamicVGPR itself, so the result is only meaningful when
/// isDynamicVGPREnabled() is true.
unsigned getDynamicVGPRBlockSize() const {
if (DynamicVGPRBlockSize32)
return 32;
return 16;
}
bool requiresDisjointEarlyClobberAndUndef() const override { bool requiresDisjointEarlyClobberAndUndef() const override {
// AMDGPU doesn't care if early-clobber and undef operands are allocated // AMDGPU doesn't care if early-clobber and undef operands are allocated

View File

@ -313,13 +313,11 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
/// Remove dependency on GCNSubtarget and depend only on the necessary values /// Remove dependency on GCNSubtarget and depend only on the necessary values
/// for said occupancy computation. Should match computeOccupancy implementation /// for said occupancy computation. Should match computeOccupancy implementation
/// without passing \p STM on. /// without passing \p STM on.
const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(unsigned InitOcc, const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(
const MCExpr *NumSGPRs, unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs,
const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) {
const GCNSubtarget &STM,
MCContext &Ctx) {
unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM); unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM); unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM); unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
unsigned Generation = STM.getGeneration(); unsigned Generation = STM.getGeneration();

View File

@ -93,11 +93,10 @@ public:
return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx); return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
} }
static const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, static const AMDGPUMCExpr *
const MCExpr *NumSGPRs, createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
const MCExpr *NumVGPRs, const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize,
const GCNSubtarget &STM, const GCNSubtarget &STM, MCContext &Ctx);
MCContext &Ctx);
ArrayRef<const MCExpr *> getArgs() const { return Args; } ArrayRef<const MCExpr *> getArgs() const { return Args; }
VariantKind getKind() const { return Kind; } VariantKind getKind() const { return Kind; }

View File

@ -197,7 +197,9 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
// pointer becomes dead and could otherwise be reused for destination. // pointer becomes dead and could otherwise be reused for destination.
RPT.advanceToNext(); RPT.advanceToNext();
GCNRegPressure MaxPressure = RPT.moveMaxPressure(); GCNRegPressure MaxPressure = RPT.moveMaxPressure();
unsigned Occupancy = MaxPressure.getOccupancy(*ST); unsigned Occupancy = MaxPressure.getOccupancy(
*ST,
MI.getMF()->getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
// Don't push over half the register budget. We don't want to introduce // Don't push over half the register budget. We don't want to introduce
// spilling just to form a soft clause. // spilling just to form a soft clause.

View File

@ -714,9 +714,10 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
assert(hasFP(MF)); assert(hasFP(MF));
Register FPReg = MFI->getFrameOffsetReg(); Register FPReg = MFI->getFrameOffsetReg();
assert(FPReg != AMDGPU::FP_REG); assert(FPReg != AMDGPU::FP_REG);
unsigned VGPRSize = unsigned VGPRSize = llvm::alignTo(
llvm::alignTo((ST.getAddressableNumVGPRs() - (ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()) -
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) * AMDGPU::IsaInfo::getVGPRAllocGranule(&ST,
MFI->getDynamicVGPRBlockSize())) *
4, 4,
FrameInfo.getMaxAlign()); FrameInfo.getMaxAlign());
MFI->setScratchReservedForDynamicVGPRs(VGPRSize); MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
@ -2087,7 +2088,7 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
bool SIFrameLowering::mayReserveScratchForCWSR( bool SIFrameLowering::mayReserveScratchForCWSR(
const MachineFunction &MF) const { const MachineFunction &MF) const {
return MF.getSubtarget<GCNSubtarget>().isDynamicVGPREnabled() && return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) && AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) &&
AMDGPU::isCompute(MF.getFunction().getCallingConv()); AMDGPU::isCompute(MF.getFunction().getCallingConv());
} }

View File

@ -1760,7 +1760,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
else if (MI.getOpcode() == AMDGPU::S_ENDPGM || else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
if (!WCG->isOptNone() && if (!WCG->isOptNone() &&
(ST->isDynamicVGPREnabled() || (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
(ST->getGeneration() >= AMDGPUSubtarget::GFX11 && (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
ScoreBrackets.getScoreRange(STORE_CNT) != 0 && ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)))) !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
@ -2652,7 +2652,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV); Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV); Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
[[maybe_unused]] unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); [[maybe_unused]] unsigned NumVGPRsMax =
ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
[[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
@ -2821,7 +2822,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
// (i.e. whether we're in dynamic VGPR mode or not). // (i.e. whether we're in dynamic VGPR mode or not).
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
// waveslot limited kernel runs slower with the deallocation. // waveslot limited kernel runs slower with the deallocation.
if (ST->isDynamicVGPREnabled()) { if (MFI->isDynamicVGPREnabled()) {
for (MachineInstr *MI : ReleaseVGPRInsts) { for (MachineInstr *MI : ReleaseVGPRInsts) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_ALLOC_VGPR)) TII->get(AMDGPU::S_ALLOC_VGPR))
@ -2832,7 +2833,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
if (!ReleaseVGPRInsts.empty() && if (!ReleaseVGPRInsts.empty() &&
(MF.getFrameInfo().hasCalls() || (MF.getFrameInfo().hasCalls() ||
ST->getOccupancyWithNumVGPRs( ST->getOccupancyWithNumVGPRs(
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) < TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
/*IsDynamicVGPR=*/false) <
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) { AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
for (MachineInstr *MI : ReleaseVGPRInsts) { for (MachineInstr *MI : ReleaseVGPRInsts) {
if (ST->requiresNopBeforeDeallocVGPRs()) { if (ST->requiresNopBeforeDeallocVGPRs()) {

View File

@ -48,6 +48,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MaxNumWorkGroups = ST.getMaxNumWorkGroups(F); MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
assert(MaxNumWorkGroups.size() == 3); assert(MaxNumWorkGroups.size() == 3);
// Temporarily check both the attribute and the subtarget feature, until the
// latter is completely removed.
DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
Occupancy = ST.computeOccupancy(F, getLDSSize()).second; Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
CallingConv::ID CC = F.getCallingConv(); CallingConv::ID CC = F.getCallingConv();
@ -716,6 +722,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()), PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()), MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()), Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) { ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs()) for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
SpillPhysVGPRS.push_back(regToString(Reg, TRI)); SpillPhysVGPRS.push_back(regToString(Reg, TRI));

View File

@ -299,6 +299,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool HasInitWholeWave = false; bool HasInitWholeWave = false;
unsigned DynamicVGPRBlockSize = 0;
unsigned ScratchReservedForDynamicVGPRs = 0; unsigned ScratchReservedForDynamicVGPRs = 0;
SIMachineFunctionInfo() = default; SIMachineFunctionInfo() = default;
@ -352,6 +353,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg, YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
StringValue()); StringValue());
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false); YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
YamlIO.mapOptional("scratchReservedForDynamicVGPRs", YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
MFI.ScratchReservedForDynamicVGPRs, 0); MFI.ScratchReservedForDynamicVGPRs, 0);
} }
@ -469,6 +471,8 @@ private:
unsigned NumSpilledSGPRs = 0; unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0; unsigned NumSpilledVGPRs = 0;
unsigned DynamicVGPRBlockSize = 0;
// The size in bytes of the scratch space reserved for the CWSR trap handler // The size in bytes of the scratch space reserved for the CWSR trap handler
// to spill some of the dynamic VGPRs. // to spill some of the dynamic VGPRs.
unsigned ScratchReservedForDynamicVGPRs = 0; unsigned ScratchReservedForDynamicVGPRs = 0;
@ -820,6 +824,9 @@ public:
BytesInStackArgArea = Bytes; BytesInStackArgArea = Bytes;
} }
// Dynamic VGPR mode is considered enabled for this function whenever the
// recorded block size is non-zero.
bool isDynamicVGPREnabled() const { return DynamicVGPRBlockSize != 0; }
// Returns the dynamic VGPR block size recorded for this function; 0 means
// dynamic VGPRs are not enabled (see isDynamicVGPREnabled()).
unsigned getDynamicVGPRBlockSize() const { return DynamicVGPRBlockSize; }
// This is only used if we need to save any dynamic VGPRs in scratch. // This is only used if we need to save any dynamic VGPRs in scratch.
unsigned getScratchReservedForDynamicVGPRs() const { unsigned getScratchReservedForDynamicVGPRs() const {
return ScratchReservedForDynamicVGPRs; return ScratchReservedForDynamicVGPRs;

View File

@ -3748,7 +3748,11 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
default: default:
return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF); return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
case AMDGPU::VGPR_32RegClassID: case AMDGPU::VGPR_32RegClassID:
return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF)); return std::min(
ST.getMaxNumVGPRs(
MinOcc,
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize()),
ST.getMaxNumVGPRs(MF));
case AMDGPU::SGPR_32RegClassID: case AMDGPU::SGPR_32RegClassID:
case AMDGPU::SGPR_LO16RegClassID: case AMDGPU::SGPR_LO16RegClassID:
return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF)); return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));

View File

@ -1158,10 +1158,16 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
} }
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32) { std::optional<bool> EnableWavefrontSize32) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts)) if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 8; return 8;
if (DynamicVGPRBlockSize != 0)
return DynamicVGPRBlockSize;
// Temporarily check the subtarget feature, until we fully switch to using
// attributes.
if (STI->getFeatureBits().test(FeatureDynamicVGPR)) if (STI->getFeatureBits().test(FeatureDynamicVGPR))
return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16; return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
@ -1205,20 +1211,26 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; } unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
unsigned DynamicVGPRBlockSize) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts)) if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512; return 512;
if (STI->getFeatureBits().test(FeatureDynamicVGPR))
// Temporarily check the subtarget feature, until we fully switch to using
// attributes.
if (DynamicVGPRBlockSize != 0 ||
STI->getFeatureBits().test(FeatureDynamicVGPR))
// On GFX12 we can allocate at most 8 blocks of VGPRs. // On GFX12 we can allocate at most 8 blocks of VGPRs.
return 8 * getVGPRAllocGranule(STI); return 8 * getVGPRAllocGranule(STI, DynamicVGPRBlockSize);
return getAddressableNumArchVGPRs(STI); return getAddressableNumArchVGPRs(STI);
} }
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs) { unsigned NumVGPRs,
return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI), unsigned DynamicVGPRBlockSize) {
getMaxWavesPerEU(STI), return getNumWavesPerEUWithNumVGPRs(
getTotalNumVGPRs(STI)); NumVGPRs, getVGPRAllocGranule(STI, DynamicVGPRBlockSize),
getMaxWavesPerEU(STI), getTotalNumVGPRs(STI));
} }
unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule, unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
@ -1257,7 +1269,8 @@ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
return 5; return 5;
} }
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) {
assert(WavesPerEU != 0); assert(WavesPerEU != 0);
unsigned MaxWavesPerEU = getMaxWavesPerEU(STI); unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
@ -1265,28 +1278,33 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
return 0; return 0;
unsigned TotNumVGPRs = getTotalNumVGPRs(STI); unsigned TotNumVGPRs = getTotalNumVGPRs(STI);
unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI); unsigned AddrsableNumVGPRs =
unsigned Granule = getVGPRAllocGranule(STI); getAddressableNumVGPRs(STI, DynamicVGPRBlockSize);
unsigned Granule = getVGPRAllocGranule(STI, DynamicVGPRBlockSize);
unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule); unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule);
if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule)) if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule))
return 0; return 0;
unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs); unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs,
DynamicVGPRBlockSize);
if (WavesPerEU < MinWavesPerEU) if (WavesPerEU < MinWavesPerEU)
return getMinNumVGPRs(STI, MinWavesPerEU); return getMinNumVGPRs(STI, MinWavesPerEU, DynamicVGPRBlockSize);
unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule); unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule);
unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext); unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
return std::min(MinNumVGPRs, AddrsableNumVGPRs); return std::min(MinNumVGPRs, AddrsableNumVGPRs);
} }
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) {
assert(WavesPerEU != 0); assert(WavesPerEU != 0);
unsigned MaxNumVGPRs = unsigned MaxNumVGPRs =
alignDown(getTotalNumVGPRs(STI) / WavesPerEU, getVGPRAllocGranule(STI)); alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI); getVGPRAllocGranule(STI, DynamicVGPRBlockSize));
unsigned AddressableNumVGPRs =
getAddressableNumVGPRs(STI, DynamicVGPRBlockSize);
return std::min(MaxNumVGPRs, AddressableNumVGPRs); return std::min(MaxNumVGPRs, AddressableNumVGPRs);
} }
@ -1299,9 +1317,11 @@ unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI,
unsigned NumVGPRs, unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32) { std::optional<bool> EnableWavefrontSize32) {
return getGranulatedNumRegisterBlocks( return getGranulatedNumRegisterBlocks(
NumVGPRs, getVGPRAllocGranule(STI, EnableWavefrontSize32)); NumVGPRs,
getVGPRAllocGranule(STI, DynamicVGPRBlockSize, EnableWavefrontSize32));
} }
} // end namespace IsaInfo } // end namespace IsaInfo
@ -2124,6 +2144,16 @@ bool getHasDepthExport(const Function &F) {
return F.getFnAttributeAsParsedInteger("amdgpu-depth-export", 0) != 0; return F.getFnAttributeAsParsedInteger("amdgpu-depth-export", 0) != 0;
} }
/// Reads the "amdgpu-dynamic-vgpr-block-size" function attribute of \p F.
///
/// \returns 16 or 32 when the attribute holds exactly that value, and 0 when
/// the attribute is absent (the parse default is 0) or holds any other value.
unsigned getDynamicVGPRBlockSize(const Function &F) {
// Validate the parsed value at its full width. Narrowing it to unsigned
// first would let an out-of-range value such as 2^32 + 16 wrap around to 16
// and be accepted as a valid block size.
const auto BlockSize =
F.getFnAttributeAsParsedInteger("amdgpu-dynamic-vgpr-block-size", 0);
if (BlockSize == 16 || BlockSize == 32)
return static_cast<unsigned>(BlockSize);
return 0;
}
}
bool hasXNACK(const MCSubtargetInfo &STI) { bool hasXNACK(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureXNACK); return STI.hasFeature(AMDGPU::FeatureXNACK);
} }

View File

@ -298,7 +298,7 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
/// For subtargets which support it, \p EnableWavefrontSize32 should match /// For subtargets which support it, \p EnableWavefrontSize32 should match
/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field. /// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
unsigned unsigned
getVGPRAllocGranule(const MCSubtargetInfo *STI, getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32 = std::nullopt); std::optional<bool> EnableWavefrontSize32 = std::nullopt);
/// \returns VGPR encoding granularity for given subtarget \p STI. /// \returns VGPR encoding granularity for given subtarget \p STI.
@ -321,20 +321,24 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI); unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI);
/// \returns Addressable number of VGPRs for given subtarget \p STI. /// \returns Addressable number of VGPRs for given subtarget \p STI.
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI); unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
unsigned DynamicVGPRBlockSize);
/// \returns Minimum number of VGPRs that meets given number of waves per /// \returns Minimum number of VGPRs that meets given number of waves per
/// execution unit requirement for given subtarget \p STI. /// execution unit requirement for given subtarget \p STI.
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU); unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize);
/// \returns Maximum number of VGPRs that meets given number of waves per /// \returns Maximum number of VGPRs that meets given number of waves per
/// execution unit requirement for given subtarget \p STI. /// execution unit requirement for given subtarget \p STI.
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU); unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize);
/// \returns Number of waves reachable for a given \p NumVGPRs usage for given /// \returns Number of waves reachable for a given \p NumVGPRs usage for given
/// subtarget \p STI. /// subtarget \p STI.
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI, unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs); unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize);
/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule /// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
/// size, \p MaxWaves possible, and \p TotalNumVGPRs available. /// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
@ -361,6 +365,7 @@ unsigned getEncodedNumVGPRBlocks(
/// subtarget \p STI when \p NumVGPRs are used. /// subtarget \p STI when \p NumVGPRs are used.
unsigned getAllocatedNumVGPRBlocks( unsigned getAllocatedNumVGPRBlocks(
const MCSubtargetInfo *STI, unsigned NumVGPRs, const MCSubtargetInfo *STI, unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32 = std::nullopt); std::optional<bool> EnableWavefrontSize32 = std::nullopt);
} // end namespace IsaInfo } // end namespace IsaInfo
@ -1305,6 +1310,12 @@ bool getHasColorExport(const Function &F);
bool getHasDepthExport(const Function &F); bool getHasDepthExport(const Function &F);
// NOTE(review): no definition or caller of hasDynamicVGPR is visible in this
// change; confirm it is still needed alongside getDynamicVGPRBlockSize.
bool hasDynamicVGPR(const Function &F);
// Returns the value of the "amdgpu-dynamic-vgpr-block-size" attribute, or 0 if
// the attribute is missing or its value is invalid.
unsigned getDynamicVGPRBlockSize(const Function &F);
LLVM_READNONE LLVM_READNONE
constexpr bool isShader(CallingConv::ID CC) { constexpr bool isShader(CallingConv::ID CC) {
switch (CC) { switch (CC) {

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-TRUE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=CHECK,CHECK-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=CHECK,CHECK-FAKE16 %s
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack. ; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
@ -137,7 +137,7 @@ define amdgpu_cs void @with_calls_no_inline_const() #0 {
ret void ret void
} }
define amdgpu_cs void @with_spills() { define amdgpu_cs void @with_spills() #0 {
; CHECK-LABEL: with_spills: ; CHECK-LABEL: with_spills:
; CHECK: ; %bb.0: ; CHECK: ; %bb.0:
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) ; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
@ -366,6 +366,6 @@ define void @default() #0 {
declare amdgpu_gfx void @callee(i32) #0 declare amdgpu_gfx void @callee(i32) #0
attributes #0 = { nounwind } attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
attributes #1 = { nounwind "frame-pointer"="none" } attributes #1 = { nounwind "frame-pointer"="none" "amdgpu-dynamic-vgpr-block-size"="16" }
attributes #2 = { nounwind "frame-pointer"="all" } attributes #2 = { nounwind "frame-pointer"="all" "amdgpu-dynamic-vgpr-block-size"="16" }

View File

@ -1,4 +1,4 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack. ; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
@ -68,5 +68,5 @@ define void @default() #0 {
declare amdgpu_gfx void @callee(i32) #0 declare amdgpu_gfx void @callee(i32) #0
attributes #0 = { nounwind } attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size" = "16" }

View File

@ -0,0 +1,305 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
; CHECK-NEXT: amdpal.pipelines:
; CHECK-NEXT: - .api: Vulkan
; CHECK-NEXT: .compute_registers:
; CHECK-NEXT: .dynamic_vgpr_en: true
; CHECK-NEXT: .tg_size_en: true
; CHECK-NEXT: .tgid_x_en: false
; CHECK-NEXT: .tgid_y_en: false
; CHECK-NEXT: .tgid_z_en: false
; CHECK-NEXT: .tidig_comp_cnt: 0x1
; CHECK-NEXT: .hardware_stages:
; CHECK-NEXT: .cs:
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: 0
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .sgpr_limit: 0x6a
; CHECK-NEXT: .threadgroup_dimensions:
; CHECK-NEXT: - 0x1
; CHECK-NEXT: - 0x400
; CHECK-NEXT: - 0x1
; CHECK-NEXT: .trap_present: false
; CHECK-NEXT: .user_data_reg_map:
; CHECK-NEXT: - 0x10000000
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: .user_sgprs: 0x3
; CHECK-NEXT: .vgpr_limit: 0x100
; CHECK-NEXT: .wavefront_size: 0x40
; CHECK-NEXT: .wgp_mode: true
; CHECK: .registers: {}
; CHECK-NEXT: .shader_functions:
; CHECK-NEXT: dynamic_stack:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: dynamic_stack_loop:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: multiple_stack:
; CHECK-NEXT: .backend_stack_size: 0x24
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: no_stack:
; CHECK-NEXT: .backend_stack_size: 0
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: no_stack_call:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: no_stack_extern_call:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: no_stack_extern_call_many_args:
; CHECK-NEXT: .backend_stack_size: 0x90
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x90
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: no_stack_indirect_call:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_lds:
; CHECK-NEXT: .backend_stack_size: 0
; CHECK-NEXT: .lds_size: 0x100
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: simple_lds_recurse:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0x100
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x29
; CHECK-NEXT: simple_stack:
; CHECK-NEXT: .backend_stack_size: 0x14
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: simple_stack_call:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x4
; CHECK-NEXT: simple_stack_extern_call:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_stack_indirect_call:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_stack_recurse:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x2a
; CHECK:amdpal.version:
; CHECK-NEXT: - 0x3
; CHECK-NEXT: - 0
; CHECK-NEXT:...
; CHECK-NEXT: .end_amdgpu_pal_metadata
; External callees: declarations only, so their bodies are not visible to this module.
; They are the call targets of the *_extern_call functions further down.
declare amdgpu_gfx float @extern_func(float) #0
declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
; Externally defined constant in addrspace(4) that holds a pointer; the
; *_indirect_call functions load their call target from it.
@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4
; Baseline case: no alloca and no call. Returns %arg0 + 1.0.
; The expected metadata at the top of this file lists a zero stack size for it.
define amdgpu_gfx float @no_stack(float %arg0) #0 {
%add = fadd float %arg0, 1.0
ret float %add
}
; One stack object: an alloca of 4 floats in addrspace(5).
; Stores 2.0 to it and reads it back (both accesses volatile), then returns
; %arg0 plus the loaded value. Also used as the callee of @no_stack_call and
; @simple_stack_call.
define amdgpu_gfx float @simple_stack(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
ret float %add
}
; Two separate stack objects, each an alloca of 4 floats in addrspace(5).
; Each one gets a volatile store of 2.0 followed by a volatile load; the two
; loaded values are added to %arg0 in turn and the final sum is returned.
define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
; Second, independent stack object.
%stack2 = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack2
%val2 = load volatile float, ptr addrspace(5) %stack2
%add2 = fadd float %add, %val2
ret float %add2
}
; The alloca lives in a conditionally executed block (bb1) rather than in the
; entry block. bb1 is taken only when %arg0 > 0.0 (ordered compare).
; Returns 0.0 when bb1 is skipped, otherwise %arg0 plus the value read back
; from the stack object.
define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
bb0:
%cmp = fcmp ogt float %arg0, 0.0
br i1 %cmp, label %bb1, label %bb2
bb1:
; Non-entry-block alloca of 4 floats in addrspace(5).
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
br label %bb2
bb2:
; Merge the two paths: 0.0 from bb0, the computed sum from bb1.
%res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
ret float %res
}
; The alloca sits inside a loop body (bb1), so it is executed once per
; iteration. The counter starts at 0 and is decremented each time round;
; the back-edge is taken while the counter value entering the iteration
; is > 0 (signed). Returns %arg0 plus the value read back from the stack
; object in the last iteration.
define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
bb0:
br label %bb1
bb1:
%ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
; Alloca inside the loop block.
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
%cmp = icmp sgt i32 %ctr, 0
%newctr = sub i32 %ctr, 1
br i1 %cmp, label %bb1, label %bb2
bb2:
ret float %add
}
; No alloca of its own; forwards %arg0 to @simple_stack (defined in this
; file) and returns that call's result.
define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
%res = call amdgpu_gfx float @simple_stack(float %arg0)
ret float %res
}
; Own stack object (4 floats in addrspace(5)) plus a direct call to
; @simple_stack, which is defined in this file. Returns the call result plus
; the value read back from the local stack object.
define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%res = call amdgpu_gfx float @simple_stack(float %arg0)
%add = fadd float %res, %val
ret float %add
}
; No alloca of its own; forwards %arg0 to @extern_func, which is only
; declared in this file (no visible body), and returns its result.
define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
%res = call amdgpu_gfx float @extern_func(float %arg0)
ret float %res
}
; Own stack object (4 floats in addrspace(5)) plus a call to the
; declaration-only @extern_func. Returns the call result plus the value read
; back from the local stack object.
define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%res = call amdgpu_gfx float @extern_func(float %arg0)
%add = fadd float %res, %val
ret float %add
}
; No alloca of its own; forwards a <64 x float> argument to the
; declaration-only @extern_func_many_args and returns its float result.
; NOTE(review): the expected metadata lists a larger stack size for this
; function (0x90) than for @no_stack_extern_call (0x10); presumably part of
; the wide argument is passed on the stack - confirm.
define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
%res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
ret float %res
}
; No alloca of its own; loads a function pointer from @funcptr and calls it
; with no arguments, ignoring any result. Returns %arg0 unchanged.
define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
%fptr = load ptr, ptr addrspace(4) @funcptr
; Indirect call: the callee is not known statically.
call amdgpu_gfx void %fptr()
ret float %arg0
}
; Own stack object (4 floats in addrspace(5)) plus an indirect call through
; the pointer loaded from @funcptr. Returns %arg0 plus the value read back
; from the local stack object; the indirect call's result is not used.
define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%fptr = load ptr, ptr addrspace(4) @funcptr
; Indirect call: the callee is not known statically.
call amdgpu_gfx void %fptr()
%add = fadd float %arg0, %val
ret float %add
}
; Own stack object (4 floats in addrspace(5)) plus an unconditional call to
; itself (there is no base case in the IR). Returns the recursive call's
; result plus the value read back from the local stack object.
define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
; Self-recursive call.
%res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
%add = fadd float %res, %val
ret float %add
}
; Module-internal array of 64 floats in addrspace(3), left as poison.
; Read by @simple_lds and @simple_lds_recurse.
@lds = internal addrspace(3) global [64 x float] poison
; Loads one float from the start of @lds and returns it. %arg0 is not used.
; No alloca and no call.
define amdgpu_gfx float @simple_lds(float %arg0) #0 {
%val = load float, ptr addrspace(3) @lds
ret float %val
}
; Loads one float from the start of @lds, then unconditionally calls itself
; with the loaded value (there is no base case in the IR) and returns the
; recursive call's result. %arg0 is not used.
define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
%val = load float, ptr addrspace(3) @lds
; Self-recursive call.
%res = call amdgpu_gfx float @simple_lds_recurse(float %val)
ret float %res
}
; Attribute group #0 is attached to every function definition and declaration
; in this file. Besides nounwind it sets the dynamic VGPR block size to 16;
; per the AMDGPU attribute documentation a non-zero value enables dynamic VGPR
; mode for the function.
attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
; Named metadata node referencing !0, the string blob defined below.
; Going by its name it carries msgpack-encoded PAL metadata.
!amdgpu.pal.metadata.msgpack = !{!0}
!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
!1 = !{i32 7}

View File

@ -0,0 +1,205 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s
; CHECK-LABEL: {{^}}_amdgpu_cs_main:
; CHECK: ; TotalNumSgprs: 34
; CHECK: ; NumVgprs: 2
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
; CHECK-NEXT: amdpal.pipelines:
; CHECK-NEXT: - .api: Vulkan
; CHECK-NEXT: .compute_registers:
; CHECK-NEXT: .dynamic_vgpr_en: true
; CHECK-NEXT: .tg_size_en: true
; CHECK-NEXT: .tgid_x_en: false
; CHECK-NEXT: .tgid_y_en: false
; CHECK-NEXT: .tgid_z_en: false
; CHECK-NEXT: .tidig_comp_cnt: 0x1
; CHECK-NEXT: .graphics_registers:
; CHECK-NEXT: .ps_extra_lds_size: 0
; CHECK-NEXT: .spi_ps_input_addr:
; CHECK-NEXT: .ancillary_ena: false
; CHECK-NEXT: .front_face_ena: true
; CHECK-NEXT: .line_stipple_tex_ena: false
; CHECK-NEXT: .linear_center_ena: true
; CHECK-NEXT: .linear_centroid_ena: true
; CHECK-NEXT: .linear_sample_ena: true
; CHECK-NEXT: .persp_center_ena: true
; CHECK-NEXT: .persp_centroid_ena: true
; CHECK-NEXT: .persp_pull_model_ena: false
; CHECK-NEXT: .persp_sample_ena: true
; CHECK-NEXT: .pos_fixed_pt_ena: true
; CHECK-NEXT: .pos_w_float_ena: false
; CHECK-NEXT: .pos_x_float_ena: false
; CHECK-NEXT: .pos_y_float_ena: false
; CHECK-NEXT: .pos_z_float_ena: false
; CHECK-NEXT: .sample_coverage_ena: false
; CHECK-NEXT: .spi_ps_input_ena:
; CHECK-NEXT: .ancillary_ena: false
; CHECK-NEXT: .front_face_ena: false
; CHECK-NEXT: .line_stipple_tex_ena: false
; CHECK-NEXT: .linear_center_ena: false
; CHECK-NEXT: .linear_centroid_ena: false
; CHECK-NEXT: .linear_sample_ena: false
; CHECK-NEXT: .persp_center_ena: false
; CHECK-NEXT: .persp_centroid_ena: false
; CHECK-NEXT: .persp_pull_model_ena: false
; CHECK-NEXT: .persp_sample_ena: true
; CHECK-NEXT: .pos_fixed_pt_ena: false
; CHECK-NEXT: .pos_w_float_ena: false
; CHECK-NEXT: .pos_x_float_ena: false
; CHECK-NEXT: .pos_y_float_ena: false
; CHECK-NEXT: .pos_z_float_ena: false
; CHECK-NEXT: .sample_coverage_ena: false
; CHECK-NEXT: .hardware_stages:
; CHECK-NEXT: .cs:
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .dynamic_vgpr_saved_count: 0x70
; CHECK-NEXT: .entry_point: _amdgpu_cs
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .sgpr_limit: 0x6a
; CHECK-NEXT: .threadgroup_dimensions:
; CHECK-NEXT: - 0x1
; CHECK-NEXT: - 0x400
; CHECK-NEXT: - 0x1
; CHECK-NEXT: .trap_present: false
; CHECK-NEXT: .user_data_reg_map:
; CHECK-NEXT: - 0x10000000
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: .user_sgprs: 0x3
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: .vgpr_limit: 0x100
; CHECK-NEXT: .wavefront_size: 0x40
; CHECK-NEXT: .wgp_mode: false
; CHECK-NEXT: .gs:
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_gs
; CHECK-NEXT: .entry_point_symbol: gs_shader
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: true
; CHECK-NEXT: .hs:
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_hs
; CHECK-NEXT: .entry_point_symbol: hs_shader
; CHECK-NEXT: .lds_size: 0x1000
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: true
; CHECK-NEXT: .ps:
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_ps
; CHECK-NEXT: .entry_point_symbol: ps_shader
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: true
; CHECK: .registers: {}
; CHECK:amdpal.version:
; CHECK-NEXT: - 0x3
; CHECK-NEXT: - 0
; CHECK-NEXT:...
; CHECK-NEXT: .end_amdgpu_pal_metadata
; Compute shader entry point; the only function in this file carrying
; attribute group #0 (which sets "amdgpu-dynamic-vgpr-block-size"="16").
; Takes one inreg i32 (%arg1) and one plain i32 (%arg2).
; It builds a 64-bit address from the program counter and %arg1, loads a
; <4 x i32> from it, derives an offset from two 10-bit fields of %arg2 and
; stores the constant 1 through the raw buffer store intrinsic.
define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
.entry:
%i = call i64 @llvm.amdgcn.s.getpc()
; -4294967296 is 0xFFFFFFFF00000000: keep only the upper 32 bits of the PC.
%i1 = and i64 %i, -4294967296
; Use %arg1 as the lower 32 bits of the address.
%i2 = zext i32 %arg1 to i64
%i3 = or i64 %i1, %i2
%i4 = inttoptr i64 %i3 to ptr addrspace(4)
; Extract bits [9:0] and bits [19:10] of %arg2 and add them.
%i5 = and i32 %arg2, 1023
%i6 = lshr i32 %arg2, 10
%i7 = and i32 %i6, 1023
%i8 = add nuw nsw i32 %i7, %i5
; Load the <4 x i32> that is passed as the second operand of the store below.
%i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
; Multiply the summed value by 4.
%.idx = shl nuw nsw i32 %i8, 2
; Store the constant 1; %.idx is the third operand (presumably the per-lane
; byte offset - confirm against the intrinsic's documentation).
call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
ret void
}
; Empty pixel shader. Uses attribute group #1, which sets
; "InitialPSInputAddr" and does not carry the dynamic VGPR block size
; attribute.
define dllexport amdgpu_ps void @ps_shader() #1 {
ret void
}
; External one-element i32 array in addrspace(3), written by @gs_shader.
@LDS.GS = external addrspace(3) global [1 x i32], align 4
; Geometry shader that stores 0 to the first element of @LDS.GS.
; NOTE(review): this function references attribute group #2, but no
; "attributes #2 = ..." definition is visible in this file - confirm that an
; empty attribute set is intended here.
define dllexport amdgpu_gs void @gs_shader() #2 {
%ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0
store i32 0, ptr addrspace(3) %ptr, align 4
ret void
}
; External 1024-element i32 array in addrspace(3), written by @hs_shader.
@LDS.HS = external addrspace(3) global [1024 x i32], align 4
; Hull shader that stores 0 to the first element of @LDS.HS.
; NOTE(review): this function references attribute group #2, but no
; "attributes #2 = ..." definition is visible in this file - confirm that an
; empty attribute set is intended here.
define dllexport amdgpu_hs void @hs_shader() #2 {
%ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0
store i32 0, ptr addrspace(3) %ptr, align 4
ret void
}
; Named metadata node referencing !0, the string blob defined below.
; Going by its name it carries msgpack-encoded PAL metadata.
!amdgpu.pal.metadata.msgpack = !{!0}
; NOTE(review): the "Function Attrs" comments on the three declarations below
; do not describe the attribute groups defined in this file: #1 is defined
; below as nounwind memory(readwrite) with "InitialPSInputAddr", and no
; definition of #2 or #3 is visible here. Treat these comments as stale unless
; confirmed otherwise.
; Function Attrs: nounwind willreturn memory(none)
; This declaration is not called by any function in this file.
declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.amdgcn.s.getpc() #2
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3
; #0 is used only by @_amdgpu_cs_main and is the only group here that sets
; "amdgpu-dynamic-vgpr-block-size".
attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode" "amdgpu-dynamic-vgpr-block-size"="16" }
; #1 is used by @ps_shader (and by the @lgc.buffer.desc.to.ptr declaration).
attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }
!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
!1 = !{i32 7}

View File

@ -0,0 +1,340 @@
# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
--- |
; Embedded LLVM IR module: one empty amdgpu_ps stub per machine function
; defined later in this file. The stubs exist to attach function attributes.
; All of them use group #0 except @global_store_optnone, which uses #1.
define amdgpu_ps void @tbuffer_store1() #0 { ret void }
define amdgpu_ps void @tbuffer_store2() #0 { ret void }
define amdgpu_ps void @flat_store() #0 { ret void }
define amdgpu_ps void @global_store() #0 { ret void }
define amdgpu_ps void @buffer_store_format() #0 { ret void }
define amdgpu_ps void @ds_write_b32() #0 { ret void }
define amdgpu_ps void @global_store_dword() #0 { ret void }
define amdgpu_ps void @multiple_basic_blocks1() #0 { ret void }
define amdgpu_ps void @multiple_basic_blocks2() #0 { ret void }
define amdgpu_ps void @multiple_basic_blocks3() #0 { ret void }
define amdgpu_ps void @recursive_loop() #0 { ret void }
define amdgpu_ps void @recursive_loop_vmem() #0 { ret void }
define amdgpu_ps void @image_store() #0 { ret void }
define amdgpu_ps void @scratch_store() #0 { ret void }
define amdgpu_ps void @buffer_atomic() #0 { ret void }
define amdgpu_ps void @flat_atomic() #0 { ret void }
define amdgpu_ps void @global_atomic() #0 { ret void }
define amdgpu_ps void @image_atomic() #0 { ret void }
define amdgpu_ps void @global_store_optnone() #1 { ret void }
; #0: dynamic VGPR block size 16.
attributes #0 = { "amdgpu-dynamic-vgpr-block-size" = "16" }
; #1: same block size, plus noinline and optnone.
attributes #1 = { "amdgpu-dynamic-vgpr-block-size" = "16" noinline optnone }
...
---
name: tbuffer_store1
body: |
; Single block: one TBUFFER store (OFFSET addressing form) followed by the end
; of the program. The expectations below require an S_ALLOC_VGPR 0 before
; S_ENDPGM and forbid S_SENDMSG 3 (presumably the DEALLOC_VGPRS message,
; which the dynamic VGPR documentation says must not be sent - confirm).
bb.0:
; CHECK-LABEL: name: tbuffer_store1
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
S_ENDPGM 0
...
---
name: tbuffer_store2
body: |
; Single block: one TBUFFER store (OFFEN addressing form, with a memory
; operand) followed by the end of the program. The expectations below require
; an S_ALLOC_VGPR 0 before S_ENDPGM and forbid S_SENDMSG 3.
bb.0:
; CHECK-LABEL: name: tbuffer_store2
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
S_ENDPGM 0
...
---
name: flat_store
body: |
; Single block: one FLAT store of four dwords followed by the end of the
; program. The expectations below require an S_ALLOC_VGPR 0 before S_ENDPGM
; and forbid S_SENDMSG 3.
bb.0:
; CHECK-LABEL: name: flat_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: global_store
body: |
; Single block: one GLOBAL store, then an explicit S_WAIT_STORECNT 0 that is
; already present in the input, then the end of the program. The expectations
; below require an S_ALLOC_VGPR 0 before S_ENDPGM and forbid S_SENDMSG 3.
bb.0:
; CHECK-LABEL: name: global_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
S_WAIT_STORECNT 0
S_ENDPGM 0
...
---
name: buffer_store_format
body: |
; Single block: one BUFFER format store (D16 variant) followed by the end of
; the program. The expectations below require an S_ALLOC_VGPR 0 before
; S_ENDPGM and forbid S_SENDMSG 3.
bb.0:
; CHECK-LABEL: name: buffer_store_format
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
S_ENDPGM 0
...
---
name: ds_write_b32
body: |
; Single block: two IMPLICIT_DEF VGPRs feed one DS write, followed by the end
; of the program. The expectations below require an S_ALLOC_VGPR 0 before
; S_ENDPGM and forbid S_SENDMSG 3.
bb.0:
; CHECK-LABEL: name: ds_write_b32
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0 = IMPLICIT_DEF
renamable $vgpr1 = IMPLICIT_DEF
DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec, implicit $m0
S_ENDPGM 0
...
---
name: global_store_dword
body: |
; Single block: a VALU multiply-add computes $vgpr0, which is then used by a
; GLOBAL store in SADDR form, followed by the end of the program. The
; expectations below require an S_ALLOC_VGPR 0 before S_ENDPGM and forbid
; S_SENDMSG 3.
; NOTE(review): the instructions read $vgpr1, $vgpr2 and $sgpr2, which are
; not in the liveins list below - confirm this is intentional.
bb.0:
liveins: $vgpr0, $sgpr0_sgpr1
; CHECK-LABEL: name: global_store_dword
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
S_ENDPGM 0
...
---
name: multiple_basic_blocks1
body: |
; Three blocks and no store instruction anywhere: bb.0 issues a BUFFER load
; and branches to bb.1; bb.1 is a self-loop that uses the loaded $vgpr0 in a
; VALU add and exits to bb.2; bb.2 ends the program. The expectations below
; require an S_ALLOC_VGPR 0 before S_ENDPGM and forbid S_SENDMSG 3.
; CHECK-LABEL: name: multiple_basic_blocks1
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
name: multiple_basic_blocks2
body: |
; Three blocks: bb.0 and bb.1 each contain a TBUFFER store and a VALU add (in
; opposite order) and both branch to bb.2, which ends the program. No block
; lists bb.1 as a successor. The expectations below look inside bb.2 and
; require an S_ALLOC_VGPR 0 before S_ENDPGM, with no S_SENDMSG 3.
; CHECK-LABEL: name: multiple_basic_blocks2
; CHECK: bb.2:
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.2
bb.1:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
name: multiple_basic_blocks3
body: |
; Five blocks: only bb.0 contains a store (TBUFFER); bb.1 and bb.3 contain
; just a VALU add. bb.0 and bb.1 branch to bb.2, bb.2 and bb.3 branch to
; bb.4, and bb.4 ends the program. No block lists bb.1 or bb.3 as a
; successor. The expectations below look inside bb.4 and require an
; S_ALLOC_VGPR 0 before S_ENDPGM, with no S_SENDMSG 3.
; CHECK-LABEL: name: multiple_basic_blocks3
; CHECK: bb.4:
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
S_BRANCH %bb.2
bb.1:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.2
bb.2:
successors: %bb.4
S_BRANCH %bb.4
bb.3:
successors: %bb.4
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.4
bb.4:
S_ENDPGM 0
...
---
name: recursive_loop
body: |
; Three blocks and no store instruction: bb.0 issues a BUFFER load and
; branches to bb.1; bb.1 is a self-loop containing only a scalar compare and
; branches; bb.2 ends the program. The expectations below require an
; S_ALLOC_VGPR 0 before S_ENDPGM and forbid S_SENDMSG 3.
; CHECK-LABEL: name: recursive_loop
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
name: recursive_loop_vmem
body: |
; Same shape as recursive_loop, except that the self-loop in bb.1 also
; contains a TBUFFER store. bb.0 issues a BUFFER load, bb.2 ends the program.
; The expectations below require an S_ALLOC_VGPR 0 before S_ENDPGM and forbid
; S_SENDMSG 3.
; CHECK-LABEL: name: recursive_loop_vmem
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
name: image_store
body: |
; Single block: one IMAGE store followed by the end of the program. The
; expectations below require an S_ALLOC_VGPR 0 before S_ENDPGM and forbid
; S_SENDMSG 3.
; NOTE(review): the opcode carries a _gfx11 suffix while the test runs with
; -mcpu=gfx1200 - confirm this is the intended opcode variant.
bb.0:
; CHECK-LABEL: name: image_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
S_ENDPGM 0
...
---
name: scratch_store
body: |
; Single block: $sgpr0 is masked with -16 (clearing its low four bits) and
; then used as the scalar address of one SCRATCH store, followed by the end
; of the program. The expectations below require an S_ALLOC_VGPR 0 before
; S_ENDPGM and forbid S_SENDMSG 3.
bb.0:
; CHECK-LABEL: name: scratch_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: buffer_atomic
body: |
; Single block: one BUFFER atomic add that defines no result register,
; followed by the end of the program. The expectations below require an
; S_ALLOC_VGPR 0 before S_ENDPGM and forbid S_SENDMSG 3.
bb.0:
; CHECK-LABEL: name: buffer_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
S_ENDPGM 0
...
---
name: flat_atomic
body: |
; Single block: one FLAT atomic decrement that writes its result back to
; $vgpr0_vgpr1 (RTN form), followed by the end of the program. The
; expectations below require an S_ALLOC_VGPR 0 before S_ENDPGM and forbid
; S_SENDMSG 3.
bb.0:
; CHECK-LABEL: name: flat_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
name: global_atomic
body: |
; Single block: one GLOBAL atomic increment in SADDR form that writes its
; result back to $vgpr0_vgpr1 (RTN form), followed by the end of the program.
; The expectations below require an S_ALLOC_VGPR 0 before S_ENDPGM and forbid
; S_SENDMSG 3.
bb.0:
; CHECK-LABEL: name: global_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
S_ENDPGM 0
...
---
name: image_atomic
body: |
; Single block: one IMAGE atomic compare-and-swap that writes its result back
; to $vgpr0_vgpr1_vgpr2_vgpr3, followed by the end of the program. The
; expectations below require an S_ALLOC_VGPR 0 before S_ENDPGM and forbid
; S_SENDMSG 3.
bb.0:
; CHECK-LABEL: name: image_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
S_ENDPGM 0
...
---
name: global_store_optnone
body: |
; Same instructions as global_store, but the IR stub for this function uses
; the attribute group that adds noinline and optnone. Unlike every other
; function in this file, the expectations below forbid S_ALLOC_VGPR as well
; as S_SENDMSG 3.
bb.0:
; CHECK-LABEL: name: global_store_optnone
; CHECK-NOT: S_SENDMSG 3
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
S_WAIT_STORECNT 0
S_ENDPGM 0
...

View File

@ -1,5 +1,4 @@
# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT # RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
# RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
--- | --- |
define amdgpu_ps void @tbuffer_store1() { ret void } define amdgpu_ps void @tbuffer_store1() { ret void }
@ -29,8 +28,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: tbuffer_store1 ; CHECK-LABEL: name: tbuffer_store1
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
S_ENDPGM 0 S_ENDPGM 0
@ -42,8 +40,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: tbuffer_store2 ; CHECK-LABEL: name: tbuffer_store2
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
S_ENDPGM 0 S_ENDPGM 0
@ -55,8 +52,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: flat_store ; CHECK-LABEL: name: flat_store
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0 S_ENDPGM 0
@ -68,8 +64,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: global_store ; CHECK-LABEL: name: global_store
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
S_WAIT_STORECNT 0 S_WAIT_STORECNT 0
@ -82,8 +77,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: buffer_store_format ; CHECK-LABEL: name: buffer_store_format
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
S_ENDPGM 0 S_ENDPGM 0
@ -95,8 +89,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: ds_write_b32 ; CHECK-LABEL: name: ds_write_b32
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
renamable $vgpr0 = IMPLICIT_DEF renamable $vgpr0 = IMPLICIT_DEF
renamable $vgpr1 = IMPLICIT_DEF renamable $vgpr1 = IMPLICIT_DEF
@ -112,8 +105,7 @@ body: |
; CHECK-LABEL: name: global_store_dword ; CHECK-LABEL: name: global_store_dword
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
@ -125,8 +117,7 @@ name: multiple_basic_blocks1
body: | body: |
; CHECK-LABEL: name: multiple_basic_blocks1 ; CHECK-LABEL: name: multiple_basic_blocks1
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
bb.0: bb.0:
successors: %bb.1 successors: %bb.1
@ -153,8 +144,7 @@ body: |
; CHECK-LABEL: name: multiple_basic_blocks2 ; CHECK-LABEL: name: multiple_basic_blocks2
; CHECK: bb.2: ; CHECK: bb.2:
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
bb.0: bb.0:
successors: %bb.2 successors: %bb.2
@ -180,8 +170,7 @@ body: |
; CHECK-LABEL: name: multiple_basic_blocks3 ; CHECK-LABEL: name: multiple_basic_blocks3
; CHECK: bb.4: ; CHECK: bb.4:
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
bb.0: bb.0:
successors: %bb.2 successors: %bb.2
@ -216,8 +205,7 @@ name: recursive_loop
body: | body: |
; CHECK-LABEL: name: recursive_loop ; CHECK-LABEL: name: recursive_loop
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
bb.0: bb.0:
successors: %bb.1 successors: %bb.1
@ -241,8 +229,7 @@ name: recursive_loop_vmem
body: | body: |
; CHECK-LABEL: name: recursive_loop_vmem ; CHECK-LABEL: name: recursive_loop_vmem
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
bb.0: bb.0:
successors: %bb.1 successors: %bb.1
@ -268,8 +255,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: image_store ; CHECK-LABEL: name: image_store
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7) IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
S_ENDPGM 0 S_ENDPGM 0
@ -281,8 +267,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: scratch_store ; CHECK-LABEL: name: scratch_store
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
@ -295,8 +280,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: buffer_atomic ; CHECK-LABEL: name: buffer_atomic
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7) BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
S_ENDPGM 0 S_ENDPGM 0
@ -308,8 +292,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: flat_atomic ; CHECK-LABEL: name: flat_atomic
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
S_ENDPGM 0 S_ENDPGM 0
@ -322,8 +305,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: global_atomic ; CHECK-LABEL: name: global_atomic
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
S_ENDPGM 0 S_ENDPGM 0
@ -335,8 +317,7 @@ body: |
bb.0: bb.0:
; CHECK-LABEL: name: image_atomic ; CHECK-LABEL: name: image_atomic
; CHECK-NOT: S_SENDMSG 3 ; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR ; CHECK-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0 ; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7) renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
S_ENDPGM 0 S_ENDPGM 0

View File

@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body: ; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 { define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
@ -312,6 +313,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body: ; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 { define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {

View File

@ -44,6 +44,7 @@
; AFTER-PEI-NEXT: sgprForEXECCopy: '' ; AFTER-PEI-NEXT: sgprForEXECCopy: ''
; AFTER-PEI-NEXT: longBranchReservedReg: '' ; AFTER-PEI-NEXT: longBranchReservedReg: ''
; AFTER-PEI-NEXT: hasInitWholeWave: false ; AFTER-PEI-NEXT: hasInitWholeWave: false
; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0
; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0 ; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
; AFTER-PEI-NEXT: body: ; AFTER-PEI-NEXT: body:
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 { define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {

View File

@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body: ; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 { define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {

View File

@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3' ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body: ; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 { define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {

View File

@ -53,6 +53,7 @@
# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body: # FULL-NEXT: body:
@ -159,6 +160,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body: # FULL-NEXT: body:
@ -236,6 +238,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body: # FULL-NEXT: body:
@ -314,6 +317,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: '' # FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: '' # FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false # FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0 # FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body: # FULL-NEXT: body:

View File

@ -54,6 +54,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body: ; CHECK-NEXT: body:
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
@ -102,6 +103,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body: ; CHECK-NEXT: body:
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
@ -174,6 +176,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body: ; CHECK-NEXT: body:
define void @function() { define void @function() {
@ -228,6 +231,7 @@ define void @function() {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' ; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '' ; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false ; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0 ; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body: ; CHECK-NEXT: body:
define void @function_nsz() #0 { define void @function_nsz() #0 {

View File

@ -93,16 +93,16 @@ static const std::pair<StringRef, StringRef>
W32FS = {"+wavefrontsize32", "w32"}, W32FS = {"+wavefrontsize32", "w32"},
W64FS = {"+wavefrontsize64", "w64"}; W64FS = {"+wavefrontsize64", "w64"};
using TestFuncTy = using TestFuncTy = function_ref<bool(std::stringstream &, unsigned,
function_ref<bool(std::stringstream &, unsigned, const GCNSubtarget &)>; const GCNSubtarget &, bool)>;
static bool testAndRecord(std::stringstream &Table, const GCNSubtarget &ST, static bool testAndRecord(std::stringstream &Table, const GCNSubtarget &ST,
TestFuncTy test) { TestFuncTy test, unsigned DynamicVGPRBlockSize) {
bool Success = true; bool Success = true;
unsigned MaxOcc = ST.getMaxWavesPerEU(); unsigned MaxOcc = ST.getMaxWavesPerEU();
for (unsigned Occ = MaxOcc; Occ > 0; --Occ) { for (unsigned Occ = MaxOcc; Occ > 0; --Occ) {
Table << std::right << std::setw(3) << Occ << " "; Table << std::right << std::setw(3) << Occ << " ";
Success = test(Table, Occ, ST) && Success; Success = test(Table, Occ, ST, DynamicVGPRBlockSize) && Success;
Table << '\n'; Table << '\n';
} }
return Success; return Success;
@ -132,7 +132,7 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
FS = &W32FS; FS = &W32FS;
std::stringstream Table; std::stringstream Table;
bool Success = testAndRecord(Table, ST, test); bool Success = testAndRecord(Table, ST, test, /*DynamicVGPRBlockSize=*/0);
if (!Success || PrintCpuRegLimits) if (!Success || PrintCpuRegLimits)
TablePerCPUs[Table.str()].push_back((CanonCPUName + FS->second).str()); TablePerCPUs[Table.str()].push_back((CanonCPUName + FS->second).str());
@ -155,40 +155,50 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS, static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
TestFuncTy test) { TestFuncTy test) {
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
"+dynamic-vgpr," + FS.str());
ASSERT_TRUE(TM) << "No target machine"; ASSERT_TRUE(TM) << "No target machine";
GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
std::string(TM->getTargetFeatureString()), *TM); std::string(TM->getTargetFeatureString()), *TM);
ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
auto testWithBlockSize = [&](unsigned DynamicVGPRBlockSize) {
std::stringstream Table; std::stringstream Table;
bool Success = testAndRecord(Table, ST, test); bool Success = testAndRecord(Table, ST, test, DynamicVGPRBlockSize);
EXPECT_TRUE(Success && !PrintCpuRegLimits) EXPECT_TRUE(Success && !PrintCpuRegLimits)
<< CPUName << " dynamic VGPR " << FS << CPUName << " dynamic VGPR block size " << DynamicVGPRBlockSize
<< ":\nOcc MinVGPR MaxVGPR\n" << ":\nOcc MinVGPR MaxVGPR\n"
<< Table.str() << '\n'; << Table.str() << '\n';
};
testWithBlockSize(16);
testWithBlockSize(32);
} }
TEST(AMDGPU, TestVGPRLimitsPerOccupancy) { TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) { auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST,
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs(); unsigned DynamicVGPRBlockSize) {
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs(DynamicVGPRBlockSize);
return checkMinMax( return checkMinMax(
OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum), ST.getMaxWavesPerEU(), OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum, DynamicVGPRBlockSize),
[&](unsigned NumGPRs) { return ST.getOccupancyWithNumVGPRs(NumGPRs); }, ST.getMaxWavesPerEU(),
[&](unsigned Occ) { return ST.getMinNumVGPRs(Occ); }, [&](unsigned NumGPRs) {
[&](unsigned Occ) { return ST.getMaxNumVGPRs(Occ); }); return ST.getOccupancyWithNumVGPRs(NumGPRs, DynamicVGPRBlockSize);
},
[&](unsigned Occ) {
return ST.getMinNumVGPRs(Occ, DynamicVGPRBlockSize);
},
[&](unsigned Occ) {
return ST.getMaxNumVGPRs(Occ, DynamicVGPRBlockSize);
});
}; };
testGPRLimits("VGPR", true, test); testGPRLimits("VGPR", true, test);
testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test); testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
testDynamicVGPRLimits("gfx1200",
"+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
} }
static void testAbsoluteLimits(StringRef CPUName, StringRef FS, static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
unsigned DynamicVGPRBlockSize,
unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc, unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
unsigned ExpectedMaxVGPRs) { unsigned ExpectedMaxVGPRs) {
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS); auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
@ -206,11 +216,15 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
Func->setCallingConv(CallingConv::AMDGPU_CS_Chain); Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
Func->addFnAttr("amdgpu-flat-work-group-size", "1,32"); Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
std::string DVGPRBlockSize = std::to_string(DynamicVGPRBlockSize);
if (DynamicVGPRBlockSize)
Func->addFnAttr("amdgpu-dynamic-vgpr-block-size", DVGPRBlockSize);
auto Range = ST.getWavesPerEU(*Func); auto Range = ST.getWavesPerEU(*Func);
EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS; EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS; EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS; EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs()) EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs(DynamicVGPRBlockSize))
<< CPUName << ' ' << FS; << CPUName << ' ' << FS;
// Function with requested 'amdgpu-waves-per-eu' in a valid range. // Function with requested 'amdgpu-waves-per-eu' in a valid range.
@ -221,11 +235,10 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
} }
TEST(AMDGPU, TestOccupancyAbsoluteLimits) { TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256); // CPUName, Features, DynamicVGPRBlockSize; Expected MinOcc, MaxOcc, MaxVGPRs
testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128); testAbsoluteLimits("gfx1200", "+wavefrontsize32", 0, 1, 16, 256);
testAbsoluteLimits( testAbsoluteLimits("gfx1200", "+wavefrontsize32", 16, 1, 16, 128);
"gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32", testAbsoluteLimits("gfx1200", "+wavefrontsize32", 32, 1, 16, 256);
1, 16, 256);
} }
static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) { static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {