[AMDGPU] Replace dynamic VGPR feature with attribute (#133444)

Use a function attribute (amdgpu-dynamic-vgpr-block-size) instead of a subtarget
feature, as requested in #130030.
This commit is contained in:
Diana Picus 2025-06-24 11:09:36 +02:00 committed by GitHub
parent 6cfa03f1f1
commit a201f8872a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
34 changed files with 1205 additions and 199 deletions

View File

@ -768,11 +768,6 @@ For example:
performant than code generated for XNACK replay
disabled.
dynamic-vgpr TODO Represents the "Dynamic VGPR" hardware mode, introduced in GFX12.
Waves launched in this mode may allocate or deallocate the VGPRs
using dedicated instructions, but may not send the DEALLOC_VGPRS
message.
=============== ============================ ==================================================
.. _amdgpu-target-id:
@ -1764,6 +1759,15 @@ The AMDGPU backend supports the following LLVM IR attributes.
"amdgpu-promote-alloca-to-vector-vgpr-ratio" Ratio of VGPRs to budget for promoting alloca to vectors.
"amdgpu-dynamic-vgpr-block-size" Represents the size of a VGPR block in the "Dynamic VGPR" hardware mode,
introduced in GFX12.
A value of 0 (default) means that dynamic VGPRs are not enabled.
Valid values for GFX12+ are 16 and 32.
Waves launched in this mode may allocate or deallocate the VGPRs
using dedicated instructions, but may not send the DEALLOC_VGPRS
message. If a shader has this attribute, then all its callees must
match its value.
================================================ ==========================================================
Calling Conventions

View File

@ -1281,12 +1281,14 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
>;
// FIXME: Remove after all users are migrated to attribute.
// "Dynamic VGPR" hardware mode, introduced in GFX12: waves launched in this
// mode may allocate or deallocate VGPRs using dedicated instructions, but may
// not send the DEALLOC_VGPRS message.
def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
"DynamicVGPR",
"true",
"Enable dynamic VGPR mode"
>;
// FIXME: Remove after all users are migrated to attribute.
def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
"DynamicVGPRBlockSize32",
"true",

View File

@ -452,15 +452,17 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
unsigned MaxWaves = MFI.getMaxWavesPerEU();
uint64_t TotalNumVgpr =
getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
uint64_t NumVGPRsForWavesPerEU = std::max(
{TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
uint64_t NumVGPRsForWavesPerEU =
std::max({TotalNumVgpr, (uint64_t)1,
(uint64_t)STM.getMinNumVGPRs(
MaxWaves, MFI.getDynamicVGPRBlockSize())});
uint64_t NumSGPRsForWavesPerEU = std::max(
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
STM.getOccupancyWithWorkGroupSizes(*MF).second,
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
OutContext);
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
MFI.getDynamicVGPRBlockSize(), STM, OutContext);
uint64_t Occupancy;
const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
@ -1082,7 +1084,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
Ctx);
ProgInfo.NumVGPRsForWavesPerEU =
AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
CreateExpr(STM.getMinNumVGPRs(
MaxWaves, MFI->getDynamicVGPRBlockSize()))},
Ctx);
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
@ -1256,7 +1259,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
MFI->getDynamicVGPRBlockSize(), STM, Ctx);
const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
@ -1405,7 +1409,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
// Helper function to add common PAL Metadata 3.0+
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
const SIProgramInfo &CurrentProgramInfo,
CallingConv::ID CC, const GCNSubtarget &ST) {
CallingConv::ID CC, const GCNSubtarget &ST,
unsigned DynamicVGPRBlockSize) {
if (ST.hasIEEEMode())
MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
@ -1417,7 +1422,7 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
(bool)CurrentProgramInfo.TrapHandlerEnable);
MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
if (ST.isDynamicVGPREnabled())
if (DynamicVGPRBlockSize != 0)
MD->setComputeRegisters(".dynamic_vgpr_en", true);
}
@ -1444,7 +1449,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
// For targets that support dynamic VGPRs, set the number of saved dynamic
// VGPRs (if any) in the PAL metadata.
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.isDynamicVGPREnabled() &&
if (MFI->isDynamicVGPREnabled() &&
MFI->getScratchReservedForDynamicVGPRs() > 0)
MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
MFI->getScratchReservedForDynamicVGPRs() / 4);
@ -1470,7 +1475,8 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
CurrentProgramInfo.ScratchEnable);
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
MFI->getDynamicVGPRBlockSize());
}
// ScratchSize is in bytes, 16 aligned.
@ -1541,7 +1547,9 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
MD->setRsrc2(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
} else {
EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
EmitPALMetadataCommon(
MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
}
// Set optional info

View File

@ -173,8 +173,16 @@ static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
return 128;
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
// Temporarily check both the attribute and the subtarget feature, until the
// latter is removed.
if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
unsigned MaxVGPRs = ST.getMaxNumVGPRs(
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first);
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
DynamicVGPRBlockSize);
// A non-entry function has only 32 caller preserved registers.
// Do not promote alloca which will force spilling unless we know the function

View File

@ -448,7 +448,10 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// TODO: assert Regions are sorted descending by pressure
const auto &ST = MF.getSubtarget<GCNSubtarget>();
const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
const unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
const auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
<< ", current = " << Occ << '\n');
@ -457,7 +460,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// Always build the DAG to add mutations
BuildDAG DAG(*R, *this);
if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc)
continue;
LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
@ -468,7 +471,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
printSchedRP(dbgs(), R->MaxPressure, MaxRP));
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize));
if (NewOcc <= Occ)
break;
@ -489,9 +492,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto &ST = MF.getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
@ -522,19 +527,21 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto RP = getRegionPressure(*R);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
ST, DynamicVGPRBlockSize) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
} else {
LLVM_DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder();
assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >=
TgtOcc);
}
}
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
FinalOccupancy =
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
}
MFI->limitOccupancy(FinalOccupancy);
@ -580,9 +587,11 @@ void GCNIterativeScheduler::scheduleILP(
const auto &ST = MF.getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
@ -603,17 +612,18 @@ void GCNIterativeScheduler::scheduleILP(
const auto RP = getSchedulePressure(*R, ILPSchedule);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
if (RP.getOccupancy(ST) < TgtOcc) {
if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
ST, DynamicVGPRBlockSize) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
}
} else {
scheduleRegion(*R, ILPSchedule, RP);
LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
FinalOccupancy =
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
}
MFI->limitOccupancy(FinalOccupancy);

View File

@ -251,7 +251,9 @@ bool GCNNSAReassignImpl::run(MachineFunction &MF) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
MaxNumVGPRs = std::min(
ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->getDynamicVGPRBlockSize()),
MaxNumVGPRs);
CSRegs = MRI->getCalleeSavedRegs();
using Candidate = std::pair<const MachineInstr*, bool>;

View File

@ -13,6 +13,7 @@
#include "GCNRegPressure.h"
#include "AMDGPU.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@ -94,17 +95,20 @@ void GCNRegPressure::inc(unsigned Reg,
bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGPRNum()));
const auto VGPROcc =
std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
const auto VGPROcc = std::min(
MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
const auto OtherSGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
const auto OtherVGPROcc =
std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts())));
std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@ -226,13 +230,15 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
O.getVGPRNum(ST.hasGFX90AInsts()));
}
Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
return Printable([&RP, ST](raw_ostream &OS) {
Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
unsigned DynamicVGPRBlockSize) {
return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) {
OS << "VGPRs: " << RP.getArchVGPRNum() << ' '
<< "AGPRs: " << RP.getAGPRNum();
if (ST)
OS << "(O"
<< ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()))
<< ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()),
DynamicVGPRBlockSize)
<< ')';
OS << ", SGPRs: " << RP.getSGPRNum();
if (ST)
@ -240,7 +246,7 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight()
<< ", LSGPR WT: " << RP.getSGPRTuplesWeight();
if (ST)
OS << " -> Occ: " << RP.getOccupancy(*ST);
OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize);
OS << '\n';
});
}

View File

@ -69,9 +69,11 @@ struct GCNRegPressure {
}
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
unsigned getOccupancy(const GCNSubtarget &ST) const {
unsigned getOccupancy(const GCNSubtarget &ST,
unsigned DynamicVGPRBlockSize) const {
return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
}
void inc(unsigned Reg,
@ -79,8 +81,10 @@ struct GCNRegPressure {
LaneBitmask NewMask,
const MachineRegisterInfo &MRI);
bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const {
return getOccupancy(ST) > O.getOccupancy(ST);
bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O,
unsigned DynamicVGPRBlockSize) const {
return getOccupancy(ST, DynamicVGPRBlockSize) >
O.getOccupancy(ST, DynamicVGPRBlockSize);
}
/// Compares \p this GCNRegPressure to \p O, returning true if \p this is
@ -133,7 +137,8 @@ private:
friend GCNRegPressure max(const GCNRegPressure &P1,
const GCNRegPressure &P2);
friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST);
friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST,
unsigned DynamicVGPRBlockSize);
};
inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
@ -402,7 +407,8 @@ GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
bool isEqual(const GCNRPTracker::LiveRegSet &S1,
const GCNRPTracker::LiveRegSet &S2);
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr);
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr,
unsigned DynamicVGPRBlockSize = 0);
Printable print(const GCNRPTracker::LiveRegSet &LiveRegs,
const MachineRegisterInfo &MRI);

View File

@ -99,17 +99,20 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
if (!KnownExcessRP) {
VGPRCriticalLimit =
std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
VGPRCriticalLimit = std::min(
ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
VGPRExcessLimit);
} else {
// This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
// returns a reasonably small number for targets with lots of VGPRs, such
// as GFX10 and GFX11.
LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
"VGPRCriticalLimit calculation method.\n");
unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
unsigned Granule =
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
unsigned Addressable =
AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
VGPRBudget = std::max(VGPRBudget, Granule);
VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
@ -1136,7 +1139,8 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
if (DAG.MinOccupancy > InitialOccupancy) {
for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
DAG.RegionsWithMinOcc[IDX] =
DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
DAG.Pressure[IDX].getOccupancy(
DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << StageID
<< " stage successfully increased occupancy to "
@ -1273,11 +1277,14 @@ void GCNSchedStage::checkScheduling() {
LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
DAG.MinOccupancy;
// Early out if we have achieved the occupancy target.
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
@ -1286,10 +1293,10 @@ void GCNSchedStage::checkScheduling() {
unsigned TargetOccupancy = std::min(
S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
unsigned WavesAfter =
std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
unsigned WavesBefore =
std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
unsigned WavesAfter = std::min(
TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
unsigned WavesBefore = std::min(
TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
<< ", after " << WavesAfter << ".\n");
@ -1338,7 +1345,8 @@ void GCNSchedStage::checkScheduling() {
} else {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
DAG.MinOccupancy;
}
}
@ -1461,11 +1469,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
return true;
// For dynamic VGPR mode, we don't want to waste any VGPR blocks.
if (ST.isDynamicVGPREnabled()) {
if (DAG.MFI.isDynamicVGPREnabled()) {
unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, PressureBefore.getVGPRNum(false));
&ST, DAG.MFI.getDynamicVGPRBlockSize(),
PressureBefore.getVGPRNum(false));
unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, PressureAfter.getVGPRNum(false));
&ST, DAG.MFI.getDynamicVGPRBlockSize(),
PressureAfter.getVGPRNum(false));
if (BlocksAfter > BlocksBefore)
return true;
}
@ -1489,7 +1499,8 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
// If RP is not reduced in the unclustered reschedule stage, revert to the
// old schedule.
if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
if ((WavesAfter <=
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
mayCauseSpilling(WavesAfter)) ||
GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
@ -1511,8 +1522,9 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
ScheduleMetrics MAfter = getScheduleMetrics(DAG);
unsigned OldMetric = MBefore.getMetric();
unsigned NewMetric = MAfter.getMetric();
unsigned WavesBefore =
std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
unsigned WavesBefore = std::min(
S.getTargetOccupancy(),
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
unsigned Profit =
((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
@ -1566,7 +1578,8 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
void GCNSchedStage::revertScheduling() {
DAG.RegionsWithMinOcc[RegionIdx] =
PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) ==
DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
DAG.RegionEnd = DAG.RegionBegin;
int SkippedDebugInstr = 0;
@ -1844,13 +1857,16 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
// occupancy, or regions with VGPR spilling) to a model of their excess RP.
DenseMap<unsigned, ExcessRP> OptRegions;
const Function &F = MF.getFunction();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
const unsigned MaxSGPRsIncOcc =
ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
const unsigned MaxVGPRsIncOcc =
ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
auto ClearOptRegionsIf = [&](bool Cond) -> bool {
@ -2163,7 +2179,9 @@ void PreRARematStage::rematerialize() {
}
}
DAG.Pressure[I] = RP;
AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(ST));
AchievedOcc = std::min(
AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
->getDynamicVGPRBlockSize()));
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
}

View File

@ -366,8 +366,11 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
getGeneration());
}
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
unsigned
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
DynamicVGPRBlockSize);
}
unsigned
@ -403,9 +406,15 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs, unsigned NumVGPRs) const {
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
// Temporarily check both the attribute and the subtarget feature until the
// latter is removed.
if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);
unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
// Maximum occupancy may be further limited by high SGPR/VGPR usage.
MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
@ -512,9 +521,16 @@ unsigned GCNSubtarget::getBaseMaxNumVGPRs(
}
unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
// Temporarily check both the attribute and the subtarget feature, until the
// latter is removed.
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
return getBaseMaxNumVGPRs(
F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)});
F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
}
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {

View File

@ -1391,7 +1391,8 @@ public:
/// Return the maximum number of waves per SIMD for kernels using \p VGPRs
/// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
unsigned DynamicVGPRBlockSize) const;
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p F, each workgroup
@ -1549,8 +1550,8 @@ public:
unsigned getMaxNumSGPRs(const Function &F) const;
/// \returns VGPR allocation granularity supported by the subtarget.
unsigned getVGPRAllocGranule() const {
return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
}
/// \returns VGPR encoding granularity supported by the subtarget.
@ -1570,20 +1571,24 @@ public:
}
/// \returns Addressable number of VGPRs supported by the subtarget.
unsigned getAddressableNumVGPRs() const {
return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
}
/// \returns the minimum number of VGPRs that will prevent achieving more than
/// the specified number of waves \p WavesPerEU.
unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
unsigned getMinNumVGPRs(unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
DynamicVGPRBlockSize);
}
/// \returns the maximum number of VGPRs that can be used and still achieved
/// at least the specified number of waves \p WavesPerEU.
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
unsigned getMaxNumVGPRs(unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) const {
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
DynamicVGPRBlockSize);
}
/// \returns max num VGPRs. This is the common utility function
@ -1686,6 +1691,9 @@ public:
}
// True iff the "dynamic-vgpr" subtarget feature is set. NOTE(review): callers
// are being migrated to the function-level query
// (SIMachineFunctionInfo::isDynamicVGPREnabled / the
// "amdgpu-dynamic-vgpr-block-size" attribute) — prefer that for new code.
bool isDynamicVGPREnabled() const { return DynamicVGPR; }
// VGPR block size used in dynamic VGPR mode: 32 when the
// "dynamic-vgpr-block-size-32" feature is set, 16 otherwise.
unsigned getDynamicVGPRBlockSize() const {
  if (DynamicVGPRBlockSize32)
    return 32;
  return 16;
}
bool requiresDisjointEarlyClobberAndUndef() const override {
// AMDGPU doesn't care if early-clobber and undef operands are allocated

View File

@ -313,13 +313,11 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
/// Remove dependency on GCNSubtarget and depend only on the necessary values
/// for said occupancy computation. Should match computeOccupancy implementation
/// without passing \p STM on.
const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(unsigned InitOcc,
const MCExpr *NumSGPRs,
const MCExpr *NumVGPRs,
const GCNSubtarget &STM,
MCContext &Ctx) {
const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(
unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs,
unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) {
unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM);
unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
unsigned Generation = STM.getGeneration();

View File

@ -93,11 +93,10 @@ public:
return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
}
static const AMDGPUMCExpr *createOccupancy(unsigned InitOcc,
const MCExpr *NumSGPRs,
const MCExpr *NumVGPRs,
const GCNSubtarget &STM,
MCContext &Ctx);
static const AMDGPUMCExpr *
createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize,
const GCNSubtarget &STM, MCContext &Ctx);
ArrayRef<const MCExpr *> getArgs() const { return Args; }
VariantKind getKind() const { return Kind; }

View File

@ -197,7 +197,9 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
// pointer becomes dead and could otherwise be reused for destination.
RPT.advanceToNext();
GCNRegPressure MaxPressure = RPT.moveMaxPressure();
unsigned Occupancy = MaxPressure.getOccupancy(*ST);
unsigned Occupancy = MaxPressure.getOccupancy(
*ST,
MI.getMF()->getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
// Don't push over half the register budget. We don't want to introduce
// spilling just to form a soft clause.

View File

@ -714,11 +714,12 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
assert(hasFP(MF));
Register FPReg = MFI->getFrameOffsetReg();
assert(FPReg != AMDGPU::FP_REG);
unsigned VGPRSize =
llvm::alignTo((ST.getAddressableNumVGPRs() -
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) *
4,
FrameInfo.getMaxAlign());
unsigned VGPRSize = llvm::alignTo(
(ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()) -
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST,
MFI->getDynamicVGPRBlockSize())) *
4,
FrameInfo.getMaxAlign());
MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
@ -2087,7 +2088,7 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
bool SIFrameLowering::mayReserveScratchForCWSR(
const MachineFunction &MF) const {
return MF.getSubtarget<GCNSubtarget>().isDynamicVGPREnabled() &&
return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) &&
AMDGPU::isCompute(MF.getFunction().getCallingConv());
}

View File

@ -1760,7 +1760,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
if (!WCG->isOptNone() &&
(ST->isDynamicVGPREnabled() ||
(MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
(ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
@ -2652,7 +2652,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
[[maybe_unused]] unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
[[maybe_unused]] unsigned NumVGPRsMax =
ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
[[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
@ -2821,7 +2822,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
// (i.e. whether we're in dynamic VGPR mode or not).
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
// waveslot limited kernel runs slower with the deallocation.
if (ST->isDynamicVGPREnabled()) {
if (MFI->isDynamicVGPREnabled()) {
for (MachineInstr *MI : ReleaseVGPRInsts) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_ALLOC_VGPR))
@ -2832,7 +2833,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
if (!ReleaseVGPRInsts.empty() &&
(MF.getFrameInfo().hasCalls() ||
ST->getOccupancyWithNumVGPRs(
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
/*DynamicVGPRBlockSize=*/false) <
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
for (MachineInstr *MI : ReleaseVGPRInsts) {
if (ST->requiresNopBeforeDeallocVGPRs()) {

View File

@ -48,6 +48,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
assert(MaxNumWorkGroups.size() == 3);
// Temporarily check both the attribute and the subtarget feature, until the
// latter is completely removed.
DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
CallingConv::ID CC = F.getCallingConv();
@ -716,6 +722,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
SpillPhysVGPRS.push_back(regToString(Reg, TRI));

View File

@ -299,6 +299,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool HasInitWholeWave = false;
unsigned DynamicVGPRBlockSize = 0;
unsigned ScratchReservedForDynamicVGPRs = 0;
SIMachineFunctionInfo() = default;
@ -352,6 +353,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
StringValue());
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
MFI.ScratchReservedForDynamicVGPRs, 0);
}
@ -469,6 +471,8 @@ private:
unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0;
unsigned DynamicVGPRBlockSize = 0;
// The size in bytes of the scratch space reserved for the CWSR trap handler
// to spill some of the dynamic VGPRs.
unsigned ScratchReservedForDynamicVGPRs = 0;
@ -820,6 +824,9 @@ public:
BytesInStackArgArea = Bytes;
}
bool isDynamicVGPREnabled() const { return DynamicVGPRBlockSize != 0; }
unsigned getDynamicVGPRBlockSize() const { return DynamicVGPRBlockSize; }
// This is only used if we need to save any dynamic VGPRs in scratch.
unsigned getScratchReservedForDynamicVGPRs() const {
return ScratchReservedForDynamicVGPRs;

View File

@ -3748,7 +3748,11 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
default:
return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
case AMDGPU::VGPR_32RegClassID:
return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF));
return std::min(
ST.getMaxNumVGPRs(
MinOcc,
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize()),
ST.getMaxNumVGPRs(MF));
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));

View File

@ -1158,10 +1158,16 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
}
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 8;
if (DynamicVGPRBlockSize != 0)
return DynamicVGPRBlockSize;
// Temporarily check the subtarget feature, until we fully switch to using
// attributes.
if (STI->getFeatureBits().test(FeatureDynamicVGPR))
return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
@ -1205,20 +1211,26 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
unsigned DynamicVGPRBlockSize) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512;
if (STI->getFeatureBits().test(FeatureDynamicVGPR))
// Temporarily check the subtarget feature, until we fully switch to using
// attributes.
if (DynamicVGPRBlockSize != 0 ||
STI->getFeatureBits().test(FeatureDynamicVGPR))
// On GFX12 we can allocate at most 8 blocks of VGPRs.
return 8 * getVGPRAllocGranule(STI);
return 8 * getVGPRAllocGranule(STI, DynamicVGPRBlockSize);
return getAddressableNumArchVGPRs(STI);
}
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs) {
return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI),
getMaxWavesPerEU(STI),
getTotalNumVGPRs(STI));
unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize) {
return getNumWavesPerEUWithNumVGPRs(
NumVGPRs, getVGPRAllocGranule(STI, DynamicVGPRBlockSize),
getMaxWavesPerEU(STI), getTotalNumVGPRs(STI));
}
unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
@ -1257,7 +1269,8 @@ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
return 5;
}
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) {
assert(WavesPerEU != 0);
unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
@ -1265,28 +1278,33 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
return 0;
unsigned TotNumVGPRs = getTotalNumVGPRs(STI);
unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI);
unsigned Granule = getVGPRAllocGranule(STI);
unsigned AddrsableNumVGPRs =
getAddressableNumVGPRs(STI, DynamicVGPRBlockSize);
unsigned Granule = getVGPRAllocGranule(STI, DynamicVGPRBlockSize);
unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule);
if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule))
return 0;
unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs);
unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs,
DynamicVGPRBlockSize);
if (WavesPerEU < MinWavesPerEU)
return getMinNumVGPRs(STI, MinWavesPerEU);
return getMinNumVGPRs(STI, MinWavesPerEU, DynamicVGPRBlockSize);
unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule);
unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
return std::min(MinNumVGPRs, AddrsableNumVGPRs);
}
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) {
assert(WavesPerEU != 0);
unsigned MaxNumVGPRs =
alignDown(getTotalNumVGPRs(STI) / WavesPerEU, getVGPRAllocGranule(STI));
unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI);
alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
getVGPRAllocGranule(STI, DynamicVGPRBlockSize));
unsigned AddressableNumVGPRs =
getAddressableNumVGPRs(STI, DynamicVGPRBlockSize);
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}
@ -1299,9 +1317,11 @@ unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI,
unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32) {
return getGranulatedNumRegisterBlocks(
NumVGPRs, getVGPRAllocGranule(STI, EnableWavefrontSize32));
NumVGPRs,
getVGPRAllocGranule(STI, DynamicVGPRBlockSize, EnableWavefrontSize32));
}
} // end namespace IsaInfo
@ -2124,6 +2144,16 @@ bool getHasDepthExport(const Function &F) {
return F.getFnAttributeAsParsedInteger("amdgpu-depth-export", 0) != 0;
}
/// \returns the VGPR block size requested through the
/// "amdgpu-dynamic-vgpr-block-size" function attribute, or 0 when the
/// attribute is absent or carries an unsupported value (i.e. dynamic VGPRs
/// are disabled).
unsigned getDynamicVGPRBlockSize(const Function &F) {
  const unsigned Requested =
      F.getFnAttributeAsParsedInteger("amdgpu-dynamic-vgpr-block-size", 0);
  // Per the attribute's contract, only 16 and 32 are valid block sizes;
  // anything else is treated as "not enabled".
  switch (Requested) {
  case 16:
  case 32:
    return Requested;
  default:
    return 0;
  }
}
bool hasXNACK(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureXNACK);
}

View File

@ -298,7 +298,7 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
/// For subtargets which support it, \p EnableWavefrontSize32 should match
/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
unsigned
getVGPRAllocGranule(const MCSubtargetInfo *STI,
getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32 = std::nullopt);
/// \returns VGPR encoding granularity for given subtarget \p STI.
@ -321,20 +321,24 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI);
/// \returns Addressable number of VGPRs for given subtarget \p STI.
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI);
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
unsigned DynamicVGPRBlockSize);
/// \returns Minimum number of VGPRs that meets given number of waves per
/// execution unit requirement for given subtarget \p STI.
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize);
/// \returns Maximum number of VGPRs that meets given number of waves per
/// execution unit requirement for given subtarget \p STI.
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize);
/// \returns Number of waves reachable for a given \p NumVGPRs usage for given
/// subtarget \p STI.
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs);
unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize);
/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
@ -361,6 +365,7 @@ unsigned getEncodedNumVGPRBlocks(
/// subtarget \p STI when \p NumVGPRs are used.
unsigned getAllocatedNumVGPRBlocks(
const MCSubtargetInfo *STI, unsigned NumVGPRs,
unsigned DynamicVGPRBlockSize,
std::optional<bool> EnableWavefrontSize32 = std::nullopt);
} // end namespace IsaInfo
@ -1305,6 +1310,12 @@ bool getHasColorExport(const Function &F);
bool getHasDepthExport(const Function &F);
bool hasDynamicVGPR(const Function &F);
// Returns the value of the "amdgpu-dynamic-vgpr-block-size" attribute, or 0 if
// the attribute is missing or its value is invalid.
unsigned getDynamicVGPRBlockSize(const Function &F);
LLVM_READNONE
constexpr bool isShader(CallingConv::ID CC) {
switch (CC) {

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=CHECK,CHECK-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=CHECK,CHECK-FAKE16 %s
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
@ -137,7 +137,7 @@ define amdgpu_cs void @with_calls_no_inline_const() #0 {
ret void
}
define amdgpu_cs void @with_spills() {
define amdgpu_cs void @with_spills() #0 {
; CHECK-LABEL: with_spills:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
@ -366,6 +366,6 @@ define void @default() #0 {
declare amdgpu_gfx void @callee(i32) #0
attributes #0 = { nounwind }
attributes #1 = { nounwind "frame-pointer"="none" }
attributes #2 = { nounwind "frame-pointer"="all" }
attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
attributes #1 = { nounwind "frame-pointer"="none" "amdgpu-dynamic-vgpr-block-size"="16" }
attributes #2 = { nounwind "frame-pointer"="all" "amdgpu-dynamic-vgpr-block-size"="16" }

View File

@ -1,4 +1,4 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
@ -68,5 +68,5 @@ define void @default() #0 {
declare amdgpu_gfx void @callee(i32) #0
attributes #0 = { nounwind }
attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size" = "16" }

View File

@ -0,0 +1,305 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
; CHECK-NEXT: amdpal.pipelines:
; CHECK-NEXT: - .api: Vulkan
; CHECK-NEXT: .compute_registers:
; CHECK-NEXT: .dynamic_vgpr_en: true
; CHECK-NEXT: .tg_size_en: true
; CHECK-NEXT: .tgid_x_en: false
; CHECK-NEXT: .tgid_y_en: false
; CHECK-NEXT: .tgid_z_en: false
; CHECK-NEXT: .tidig_comp_cnt: 0x1
; CHECK-NEXT: .hardware_stages:
; CHECK-NEXT: .cs:
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: 0
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .sgpr_limit: 0x6a
; CHECK-NEXT: .threadgroup_dimensions:
; CHECK-NEXT: - 0x1
; CHECK-NEXT: - 0x400
; CHECK-NEXT: - 0x1
; CHECK-NEXT: .trap_present: false
; CHECK-NEXT: .user_data_reg_map:
; CHECK-NEXT: - 0x10000000
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: .user_sgprs: 0x3
; CHECK-NEXT: .vgpr_limit: 0x100
; CHECK-NEXT: .wavefront_size: 0x40
; CHECK-NEXT: .wgp_mode: true
; CHECK: .registers: {}
; CHECK-NEXT: .shader_functions:
; CHECK-NEXT: dynamic_stack:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: dynamic_stack_loop:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: multiple_stack:
; CHECK-NEXT: .backend_stack_size: 0x24
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: no_stack:
; CHECK-NEXT: .backend_stack_size: 0
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: no_stack_call:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x3
; CHECK-NEXT: no_stack_extern_call:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: no_stack_extern_call_many_args:
; CHECK-NEXT: .backend_stack_size: 0x90
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x90
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: no_stack_indirect_call:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_lds:
; CHECK-NEXT: .backend_stack_size: 0
; CHECK-NEXT: .lds_size: 0x100
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: simple_lds_recurse:
; CHECK-NEXT: .backend_stack_size: 0x10
; CHECK-NEXT: .lds_size: 0x100
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
; CHECK-NEXT: .vgpr_count: 0x29
; CHECK-NEXT: simple_stack:
; CHECK-NEXT: .backend_stack_size: 0x14
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: simple_stack_call:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x4
; CHECK-NEXT: simple_stack_extern_call:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_stack_indirect_call:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x58
; CHECK-NEXT: simple_stack_recurse:
; CHECK-NEXT: .backend_stack_size: 0x20
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .sgpr_count: 0x24
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
; CHECK-NEXT: .vgpr_count: 0x2a
; CHECK:amdpal.version:
; CHECK-NEXT: - 0x3
; CHECK-NEXT: - 0
; CHECK-NEXT:...
; CHECK-NEXT: .end_amdgpu_pal_metadata
declare amdgpu_gfx float @extern_func(float) #0
declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4
; Leaf function, no stack and no calls: metadata above expects
; .backend_stack_size 0 and .vgpr_count 0x1 for this entry.
define amdgpu_gfx float @no_stack(float %arg0) #0 {
%add = fadd float %arg0, 1.0
ret float %add
}
; Single fixed-size alloca; the volatile store/load keeps the stack slot from
; being optimized away. Metadata above expects .backend_stack_size 0x14.
define amdgpu_gfx float @simple_stack(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
ret float %add
}
; Two separate allocas; metadata above expects .backend_stack_size 0x24.
define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
%stack2 = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack2
%val2 = load volatile float, ptr addrspace(5) %stack2
%add2 = fadd float %add, %val2
ret float %add2
}
; Alloca on a conditionally-executed path (only %bb1 allocates); metadata
; above expects .backend_stack_size 0x10 for this function.
define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
bb0:
%cmp = fcmp ogt float %arg0, 0.0
br i1 %cmp, label %bb1, label %bb2
bb1:
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
br label %bb2
bb2:
%res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
ret float %res
}
; Alloca inside a loop body; metadata above expects .backend_stack_size 0x10
; and .vgpr_count 0x3 for this function.
define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
bb0:
br label %bb1
bb1:
%ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%add = fadd float %arg0, %val
%cmp = icmp sgt i32 %ctr, 0
%newctr = sub i32 %ctr, 1
br i1 %cmp, label %bb1, label %bb2
bb2:
ret float %add
}
; No local stack, but calls @simple_stack; metadata above expects
; .backend_stack_size 0x10 (call overhead only).
define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
%res = call amdgpu_gfx float @simple_stack(float %arg0)
ret float %res
}
; Own alloca plus a call to @simple_stack; metadata above expects
; .backend_stack_size 0x20.
define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%res = call amdgpu_gfx float @simple_stack(float %arg0)
%add = fadd float %res, %val
ret float %add
}
; Call to an external function: register usage is unknown, so metadata above
; expects the conservative .vgpr_count 0x58 and .sgpr_count 0x24.
define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
%res = call amdgpu_gfx float @extern_func(float %arg0)
ret float %res
}
; Alloca plus an external call; metadata above expects .backend_stack_size
; 0x20 and the conservative .vgpr_count 0x58.
define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%res = call amdgpu_gfx float @extern_func(float %arg0)
%add = fadd float %res, %val
ret float %add
}
; External call taking a <64 x float> argument — the argument spill area grows
; the stack; metadata above expects .backend_stack_size 0x90.
define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
%res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
ret float %res
}
; Indirect call through @funcptr: callee unknown, so metadata above expects
; the conservative .vgpr_count 0x58, like the extern-call cases.
define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
%fptr = load ptr, ptr addrspace(4) @funcptr
call amdgpu_gfx void %fptr()
ret float %arg0
}
; Alloca plus an indirect call; metadata above expects .backend_stack_size
; 0x20 and the conservative .vgpr_count 0x58.
define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%fptr = load ptr, ptr addrspace(4) @funcptr
call amdgpu_gfx void %fptr()
%add = fadd float %arg0, %val
ret float %add
}
; Self-recursive with an alloca; metadata above expects .backend_stack_size
; 0x20 and .vgpr_count 0x2a.
define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
%stack = alloca float, i32 4, align 4, addrspace(5)
store volatile float 2.0, ptr addrspace(5) %stack
%val = load volatile float, ptr addrspace(5) %stack
%res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
%add = fadd float %res, %val
ret float %add
}
@lds = internal addrspace(3) global [64 x float] poison
; LDS use only, no stack; metadata above expects .lds_size 0x100 and
; .backend_stack_size 0 for this function.
define amdgpu_gfx float @simple_lds(float %arg0) #0 {
%val = load float, ptr addrspace(3) @lds
ret float %val
}
; Recursive function combining LDS use with a call; metadata above expects
; .lds_size 0x100 and .backend_stack_size 0x10.
define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
%val = load float, ptr addrspace(3) @lds
%res = call amdgpu_gfx float @simple_lds_recurse(float %val)
ret float %res
}
attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
!amdgpu.pal.metadata.msgpack = !{!0}
!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
!1 = !{i32 7}

View File

@ -0,0 +1,205 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s
; CHECK-LABEL: {{^}}_amdgpu_cs_main:
; CHECK: ; TotalNumSgprs: 34
; CHECK: ; NumVgprs: 2
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
; CHECK-NEXT: amdpal.pipelines:
; CHECK-NEXT: - .api: Vulkan
; CHECK-NEXT: .compute_registers:
; CHECK-NEXT: .dynamic_vgpr_en: true
; CHECK-NEXT: .tg_size_en: true
; CHECK-NEXT: .tgid_x_en: false
; CHECK-NEXT: .tgid_y_en: false
; CHECK-NEXT: .tgid_z_en: false
; CHECK-NEXT: .tidig_comp_cnt: 0x1
; CHECK-NEXT: .graphics_registers:
; CHECK-NEXT: .ps_extra_lds_size: 0
; CHECK-NEXT: .spi_ps_input_addr:
; CHECK-NEXT: .ancillary_ena: false
; CHECK-NEXT: .front_face_ena: true
; CHECK-NEXT: .line_stipple_tex_ena: false
; CHECK-NEXT: .linear_center_ena: true
; CHECK-NEXT: .linear_centroid_ena: true
; CHECK-NEXT: .linear_sample_ena: true
; CHECK-NEXT: .persp_center_ena: true
; CHECK-NEXT: .persp_centroid_ena: true
; CHECK-NEXT: .persp_pull_model_ena: false
; CHECK-NEXT: .persp_sample_ena: true
; CHECK-NEXT: .pos_fixed_pt_ena: true
; CHECK-NEXT: .pos_w_float_ena: false
; CHECK-NEXT: .pos_x_float_ena: false
; CHECK-NEXT: .pos_y_float_ena: false
; CHECK-NEXT: .pos_z_float_ena: false
; CHECK-NEXT: .sample_coverage_ena: false
; CHECK-NEXT: .spi_ps_input_ena:
; CHECK-NEXT: .ancillary_ena: false
; CHECK-NEXT: .front_face_ena: false
; CHECK-NEXT: .line_stipple_tex_ena: false
; CHECK-NEXT: .linear_center_ena: false
; CHECK-NEXT: .linear_centroid_ena: false
; CHECK-NEXT: .linear_sample_ena: false
; CHECK-NEXT: .persp_center_ena: false
; CHECK-NEXT: .persp_centroid_ena: false
; CHECK-NEXT: .persp_pull_model_ena: false
; CHECK-NEXT: .persp_sample_ena: true
; CHECK-NEXT: .pos_fixed_pt_ena: false
; CHECK-NEXT: .pos_w_float_ena: false
; CHECK-NEXT: .pos_x_float_ena: false
; CHECK-NEXT: .pos_y_float_ena: false
; CHECK-NEXT: .pos_z_float_ena: false
; CHECK-NEXT: .sample_coverage_ena: false
; CHECK-NEXT: .hardware_stages:
; CHECK-NEXT: .cs:
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .dynamic_vgpr_saved_count: 0x70
; CHECK-NEXT: .entry_point: _amdgpu_cs
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x22
; CHECK-NEXT: .sgpr_limit: 0x6a
; CHECK-NEXT: .threadgroup_dimensions:
; CHECK-NEXT: - 0x1
; CHECK-NEXT: - 0x400
; CHECK-NEXT: - 0x1
; CHECK-NEXT: .trap_present: false
; CHECK-NEXT: .user_data_reg_map:
; CHECK-NEXT: - 0x10000000
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: .user_sgprs: 0x3
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: .vgpr_limit: 0x100
; CHECK-NEXT: .wavefront_size: 0x40
; CHECK-NEXT: .wgp_mode: false
; CHECK-NEXT: .gs:
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_gs
; CHECK-NEXT: .entry_point_symbol: gs_shader
; CHECK-NEXT: .lds_size: 0x200
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: true
; CHECK-NEXT: .hs:
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_hs
; CHECK-NEXT: .entry_point_symbol: hs_shader
; CHECK-NEXT: .lds_size: 0x1000
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: true
; CHECK-NEXT: .ps:
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_ps
; CHECK-NEXT: .entry_point_symbol: ps_shader
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x1
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: true
; CHECK: .registers: {}
; CHECK:amdpal.version:
; CHECK-NEXT: - 0x3
; CHECK-NEXT: - 0
; CHECK-NEXT:...
; CHECK-NEXT: .end_amdgpu_pal_metadata
; Compute entry point: builds a descriptor pointer from the inreg user SGPR,
; derives a buffer offset from the packed thread IDs in %arg2, and does one
; buffer store. CHECK lines above expect .vgpr_count 0x2, .sgpr_count 0x22 and
; .dynamic_vgpr_saved_count 0x70 for the .cs stage.
define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
.entry:
%i = call i64 @llvm.amdgcn.s.getpc()
; Combine the high half of PC with the 32-bit descriptor address in %arg1.
%i1 = and i64 %i, -4294967296
%i2 = zext i32 %arg1 to i64
%i3 = or i64 %i1, %i2
%i4 = inttoptr i64 %i3 to ptr addrspace(4)
; %arg2 packs two 10-bit thread-id fields; sum them to form the lane index.
%i5 = and i32 %arg2, 1023
%i6 = lshr i32 %arg2, 10
%i7 = and i32 %i6, 1023
%i8 = add nuw nsw i32 %i7, %i5
%i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
%.idx = shl nuw nsw i32 %i8, 2
call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
ret void
}
; Trivial pixel shader; CHECK lines above expect .lds_size 0 and
; .vgpr_count 0x1 for the .ps stage.
define dllexport amdgpu_ps void @ps_shader() #1 {
ret void
}
@LDS.GS = external addrspace(3) global [1 x i32], align 4
; Geometry shader touching @LDS.GS; CHECK lines above expect .lds_size 0x200
; for the .gs stage.
define dllexport amdgpu_gs void @gs_shader() #2 {
%ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0
store i32 0, ptr addrspace(3) %ptr, align 4
ret void
}
@LDS.HS = external addrspace(3) global [1024 x i32], align 4
; Hull shader touching @LDS.HS (1024 x i32); CHECK lines above expect
; .lds_size 0x1000 for the .hs stage.
define dllexport amdgpu_hs void @hs_shader() #2 {
%ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0
store i32 0, ptr addrspace(3) %ptr, align 4
ret void
}
!amdgpu.pal.metadata.msgpack = !{!0}
; Function Attrs: nounwind willreturn memory(none)
declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.amdgcn.s.getpc() #2
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3
attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode" "amdgpu-dynamic-vgpr-block-size"="16" }
attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }
!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
!1 = !{i32 7}

View File

@ -0,0 +1,340 @@
# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
# Functions carrying the "amdgpu-dynamic-vgpr-block-size" attribute must
# release their VGPRs before the shader ends: si-insert-waitcnts is expected
# to emit S_ALLOC_VGPR 0 before S_ENDPGM instead of the DEALLOC_VGPRS message
# (S_SENDMSG 3), for every kind of memory operation below and for several CFG
# shapes. The optnone function at the end is expected to get neither.
--- |
define amdgpu_ps void @tbuffer_store1() #0 { ret void }
define amdgpu_ps void @tbuffer_store2() #0 { ret void }
define amdgpu_ps void @flat_store() #0 { ret void }
define amdgpu_ps void @global_store() #0 { ret void }
define amdgpu_ps void @buffer_store_format() #0 { ret void }
define amdgpu_ps void @ds_write_b32() #0 { ret void }
define amdgpu_ps void @global_store_dword() #0 { ret void }
define amdgpu_ps void @multiple_basic_blocks1() #0 { ret void }
define amdgpu_ps void @multiple_basic_blocks2() #0 { ret void }
define amdgpu_ps void @multiple_basic_blocks3() #0 { ret void }
define amdgpu_ps void @recursive_loop() #0 { ret void }
define amdgpu_ps void @recursive_loop_vmem() #0 { ret void }
define amdgpu_ps void @image_store() #0 { ret void }
define amdgpu_ps void @scratch_store() #0 { ret void }
define amdgpu_ps void @buffer_atomic() #0 { ret void }
define amdgpu_ps void @flat_atomic() #0 { ret void }
define amdgpu_ps void @global_atomic() #0 { ret void }
define amdgpu_ps void @image_atomic() #0 { ret void }
define amdgpu_ps void @global_store_optnone() #1 { ret void }
; #0: dynamic VGPR mode with a block size of 16.
; #1: same block size, but optnone — the checks expect no dealloc code at all.
attributes #0 = { "amdgpu-dynamic-vgpr-block-size" = "16" }
attributes #1 = { "amdgpu-dynamic-vgpr-block-size" = "16" noinline optnone }
...
---
# TBUFFER store (OFFSET form): expect S_ALLOC_VGPR 0, never S_SENDMSG 3.
name: tbuffer_store1
body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store1
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
S_ENDPGM 0
...
---
# TBUFFER store (OFFEN form, with a memory operand): same expectation.
name: tbuffer_store2
body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store2
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
S_ENDPGM 0
...
---
# FLAT store: expect S_ALLOC_VGPR 0, never S_SENDMSG 3.
name: flat_store
body: |
bb.0:
; CHECK-LABEL: name: flat_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
# GLOBAL store followed by an explicit store-counter wait: dealloc still emitted.
name: global_store
body: |
bb.0:
; CHECK-LABEL: name: global_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
S_WAIT_STORECNT 0
S_ENDPGM 0
...
---
# BUFFER format store: expect S_ALLOC_VGPR 0, never S_SENDMSG 3.
name: buffer_store_format
body: |
bb.0:
; CHECK-LABEL: name: buffer_store_format
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
S_ENDPGM 0
...
---
# LDS (DS) write: dealloc is emitted for local-memory stores too.
name: ds_write_b32
body: |
bb.0:
; CHECK-LABEL: name: ds_write_b32
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0 = IMPLICIT_DEF
renamable $vgpr1 = IMPLICIT_DEF
DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec, implicit $m0
S_ENDPGM 0
...
---
# GLOBAL store (SADDR form) preceded by VALU work: same expectation.
name: global_store_dword
body: |
bb.0:
liveins: $vgpr0, $sgpr0_sgpr1
; CHECK-LABEL: name: global_store_dword
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
S_ENDPGM 0
...
---
# Diamond-free CFG with a loop back-edge: the dealloc must land in the block
# that ends the program (bb.2).
name: multiple_basic_blocks1
body: |
; CHECK-LABEL: name: multiple_basic_blocks1
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Two predecessors both storing: dealloc only in the exit block (after bb.2).
name: multiple_basic_blocks2
body: |
; CHECK-LABEL: name: multiple_basic_blocks2
; CHECK: bb.2:
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.2
bb.1:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Deeper CFG: dealloc only in the final block (after bb.4).
name: multiple_basic_blocks3
body: |
; CHECK-LABEL: name: multiple_basic_blocks3
; CHECK: bb.4:
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
S_BRANCH %bb.2
bb.1:
successors: %bb.2
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.2
bb.2:
successors: %bb.4
S_BRANCH %bb.4
bb.3:
successors: %bb.4
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
S_BRANCH %bb.4
bb.4:
S_ENDPGM 0
...
---
# Self-loop with a VMEM load before it: dealloc still emitted at the exit.
name: recursive_loop
body: |
; CHECK-LABEL: name: recursive_loop
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Self-loop containing a VMEM store: dealloc still emitted at the exit.
name: recursive_loop_vmem
body: |
; CHECK-LABEL: name: recursive_loop_vmem
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
S_BRANCH %bb.1
bb.1:
successors: %bb.1, %bb.2
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
S_BRANCH %bb.2
bb.2:
S_ENDPGM 0
...
---
# Image store: expect S_ALLOC_VGPR 0, never S_SENDMSG 3.
name: image_store
body: |
bb.0:
; CHECK-LABEL: name: image_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
S_ENDPGM 0
...
---
# Scratch store: expect S_ALLOC_VGPR 0, never S_SENDMSG 3.
name: scratch_store
body: |
bb.0:
; CHECK-LABEL: name: scratch_store
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
# Buffer atomic (no returned value used): dealloc still required.
name: buffer_atomic
body: |
bb.0:
; CHECK-LABEL: name: buffer_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
S_ENDPGM 0
...
---
# Flat atomic with returned value: dealloc still required.
name: flat_atomic
body: |
bb.0:
; CHECK-LABEL: name: flat_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
---
# Global atomic with returned value: dealloc still required.
name: global_atomic
body: |
bb.0:
; CHECK-LABEL: name: global_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
S_ENDPGM 0
...
---
# Image atomic (cmpswap): dealloc still required.
name: image_atomic
body: |
bb.0:
; CHECK-LABEL: name: image_atomic
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ALLOC_VGPR 0
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
S_ENDPGM 0
...
---
# optnone function (attribute group #1): neither the DEALLOC_VGPRS message nor
# S_ALLOC_VGPR may be inserted.
name: global_store_optnone
body: |
bb.0:
; CHECK-LABEL: name: global_store_optnone
; CHECK-NOT: S_SENDMSG 3
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
S_WAIT_STORECNT 0
S_ENDPGM 0
...

View File

@ -1,5 +1,4 @@
# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
# RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
--- |
define amdgpu_ps void @tbuffer_store1() { ret void }
@ -29,8 +28,7 @@ body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store1
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
S_ENDPGM 0
@ -42,8 +40,7 @@ body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store2
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
S_ENDPGM 0
@ -55,8 +52,7 @@ body: |
bb.0:
; CHECK-LABEL: name: flat_store
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
@ -68,8 +64,7 @@ body: |
bb.0:
; CHECK-LABEL: name: global_store
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
S_WAIT_STORECNT 0
@ -82,8 +77,7 @@ body: |
bb.0:
; CHECK-LABEL: name: buffer_store_format
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
S_ENDPGM 0
@ -95,8 +89,7 @@ body: |
bb.0:
; CHECK-LABEL: name: ds_write_b32
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0 = IMPLICIT_DEF
renamable $vgpr1 = IMPLICIT_DEF
@ -112,8 +105,7 @@ body: |
; CHECK-LABEL: name: global_store_dword
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
@ -125,8 +117,7 @@ name: multiple_basic_blocks1
body: |
; CHECK-LABEL: name: multiple_basic_blocks1
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
@ -153,8 +144,7 @@ body: |
; CHECK-LABEL: name: multiple_basic_blocks2
; CHECK: bb.2:
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
@ -180,8 +170,7 @@ body: |
; CHECK-LABEL: name: multiple_basic_blocks3
; CHECK: bb.4:
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
@ -216,8 +205,7 @@ name: recursive_loop
body: |
; CHECK-LABEL: name: recursive_loop
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
@ -241,8 +229,7 @@ name: recursive_loop_vmem
body: |
; CHECK-LABEL: name: recursive_loop_vmem
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
@ -268,8 +255,7 @@ body: |
bb.0:
; CHECK-LABEL: name: image_store
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
S_ENDPGM 0
@ -281,8 +267,7 @@ body: |
bb.0:
; CHECK-LABEL: name: scratch_store
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
@ -295,8 +280,7 @@ body: |
bb.0:
; CHECK-LABEL: name: buffer_atomic
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
S_ENDPGM 0
@ -308,8 +292,7 @@ body: |
bb.0:
; CHECK-LABEL: name: flat_atomic
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
S_ENDPGM 0
@ -322,8 +305,7 @@ body: |
bb.0:
; CHECK-LABEL: name: global_atomic
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
S_ENDPGM 0
@ -335,8 +317,7 @@ body: |
bb.0:
; CHECK-LABEL: name: image_atomic
; CHECK-NOT: S_SENDMSG 3
; DEFAULT-NOT: S_ALLOC_VGPR
; DVGPR: S_ALLOC_VGPR 0
; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
S_ENDPGM 0

View File

@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
@ -312,6 +313,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {

View File

@ -44,6 +44,7 @@
; AFTER-PEI-NEXT: sgprForEXECCopy: ''
; AFTER-PEI-NEXT: longBranchReservedReg: ''
; AFTER-PEI-NEXT: hasInitWholeWave: false
; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0
; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
; AFTER-PEI-NEXT: body:
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {

View File

@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {

View File

@ -44,6 +44,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {

View File

@ -53,6 +53,7 @@
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:
@ -159,6 +160,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:
@ -236,6 +238,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:
@ -314,6 +317,7 @@ body: |
# FULL-NEXT: sgprForEXECCopy: ''
# FULL-NEXT: longBranchReservedReg: ''
# FULL-NEXT: hasInitWholeWave: false
# FULL-NEXT: dynamicVGPRBlockSize: 0
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
# FULL-NEXT: body:

View File

@ -54,6 +54,7 @@
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
@ -102,6 +103,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
@ -174,6 +176,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define void @function() {
@ -228,6 +231,7 @@ define void @function() {
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
; CHECK-NEXT: longBranchReservedReg: ''
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: dynamicVGPRBlockSize: 0
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
; CHECK-NEXT: body:
define void @function_nsz() #0 {

View File

@ -93,16 +93,16 @@ static const std::pair<StringRef, StringRef>
W32FS = {"+wavefrontsize32", "w32"},
W64FS = {"+wavefrontsize64", "w64"};
using TestFuncTy =
function_ref<bool(std::stringstream &, unsigned, const GCNSubtarget &)>;
using TestFuncTy = function_ref<bool(std::stringstream &, unsigned,
const GCNSubtarget &, bool)>;
static bool testAndRecord(std::stringstream &Table, const GCNSubtarget &ST,
TestFuncTy test) {
TestFuncTy test, unsigned DynamicVGPRBlockSize) {
bool Success = true;
unsigned MaxOcc = ST.getMaxWavesPerEU();
for (unsigned Occ = MaxOcc; Occ > 0; --Occ) {
Table << std::right << std::setw(3) << Occ << " ";
Success = test(Table, Occ, ST) && Success;
Success = test(Table, Occ, ST, DynamicVGPRBlockSize) && Success;
Table << '\n';
}
return Success;
@ -132,7 +132,7 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
FS = &W32FS;
std::stringstream Table;
bool Success = testAndRecord(Table, ST, test);
bool Success = testAndRecord(Table, ST, test, /*DynamicVGPRBlockSize=*/0);
if (!Success || PrintCpuRegLimits)
TablePerCPUs[Table.str()].push_back((CanonCPUName + FS->second).str());
@ -155,40 +155,50 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
TestFuncTy test) {
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
"+dynamic-vgpr," + FS.str());
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
ASSERT_TRUE(TM) << "No target machine";
GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
std::string(TM->getTargetFeatureString()), *TM);
ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
std::stringstream Table;
bool Success = testAndRecord(Table, ST, test);
EXPECT_TRUE(Success && !PrintCpuRegLimits)
<< CPUName << " dynamic VGPR " << FS
<< ":\nOcc MinVGPR MaxVGPR\n"
<< Table.str() << '\n';
auto testWithBlockSize = [&](unsigned DynamicVGPRBlockSize) {
std::stringstream Table;
bool Success = testAndRecord(Table, ST, test, DynamicVGPRBlockSize);
EXPECT_TRUE(Success && !PrintCpuRegLimits)
<< CPUName << " dynamic VGPR block size " << DynamicVGPRBlockSize
<< ":\nOcc MinVGPR MaxVGPR\n"
<< Table.str() << '\n';
};
testWithBlockSize(16);
testWithBlockSize(32);
}
TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST,
unsigned DynamicVGPRBlockSize) {
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs(DynamicVGPRBlockSize);
return checkMinMax(
OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum), ST.getMaxWavesPerEU(),
[&](unsigned NumGPRs) { return ST.getOccupancyWithNumVGPRs(NumGPRs); },
[&](unsigned Occ) { return ST.getMinNumVGPRs(Occ); },
[&](unsigned Occ) { return ST.getMaxNumVGPRs(Occ); });
OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum, DynamicVGPRBlockSize),
ST.getMaxWavesPerEU(),
[&](unsigned NumGPRs) {
return ST.getOccupancyWithNumVGPRs(NumGPRs, DynamicVGPRBlockSize);
},
[&](unsigned Occ) {
return ST.getMinNumVGPRs(Occ, DynamicVGPRBlockSize);
},
[&](unsigned Occ) {
return ST.getMaxNumVGPRs(Occ, DynamicVGPRBlockSize);
});
};
testGPRLimits("VGPR", true, test);
testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
testDynamicVGPRLimits("gfx1200",
"+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
}
static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
unsigned DynamicVGPRBlockSize,
unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
unsigned ExpectedMaxVGPRs) {
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
@ -206,11 +216,15 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
std::string DVGPRBlockSize = std::to_string(DynamicVGPRBlockSize);
if (DynamicVGPRBlockSize)
Func->addFnAttr("amdgpu-dynamic-vgpr-block-size", DVGPRBlockSize);
auto Range = ST.getWavesPerEU(*Func);
EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs(DynamicVGPRBlockSize))
<< CPUName << ' ' << FS;
// Function with requested 'amdgpu-waves-per-eu' in a valid range.
@ -221,11 +235,10 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
}
TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
testAbsoluteLimits(
"gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
1, 16, 256);
// CPUName, Features, DynamicVGPRBlockSize; Expected MinOcc, MaxOcc, MaxVGPRs
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 0, 1, 16, 256);
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 16, 1, 16, 128);
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 32, 1, 16, 256);
}
static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {