[AMDGPU] Replace dynamic VGPR feature with attribute (#133444)
Use a function attribute (amdgpu-dynamic-vgpr-block-size) instead of a subtarget feature, as requested in #130030.
This commit is contained in:
parent
6cfa03f1f1
commit
a201f8872a
@ -768,11 +768,6 @@ For example:
|
||||
performant than code generated for XNACK replay
|
||||
disabled.
|
||||
|
||||
dynamic-vgpr TODO Represents the "Dynamic VGPR" hardware mode, introduced in GFX12.
|
||||
Waves launched in this mode may allocate or deallocate the VGPRs
|
||||
using dedicated instructions, but may not send the DEALLOC_VGPRS
|
||||
message.
|
||||
|
||||
=============== ============================ ==================================================
|
||||
|
||||
.. _amdgpu-target-id:
|
||||
@ -1764,6 +1759,15 @@ The AMDGPU backend supports the following LLVM IR attributes.
|
||||
|
||||
"amdgpu-promote-alloca-to-vector-vgpr-ratio" Ratio of VGPRs to budget for promoting alloca to vectors.
|
||||
|
||||
"amdgpu-dynamic-vgpr-block-size" Represents the size of a VGPR block in the "Dynamic VGPR" hardware mode,
|
||||
introduced in GFX12.
|
||||
A value of 0 (default) means that dynamic VGPRs are not enabled.
|
||||
Valid values for GFX12+ are 16 and 32.
|
||||
Waves launched in this mode may allocate or deallocate the VGPRs
|
||||
using dedicated instructions, but may not send the DEALLOC_VGPRS
|
||||
message. If a shader has this attribute, then all its callees must
|
||||
match its value.
|
||||
|
||||
================================================ ==========================================================
|
||||
|
||||
Calling Conventions
|
||||
|
@ -1281,12 +1281,14 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
|
||||
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
|
||||
>;
|
||||
|
||||
// FIXME: Remove after all users are migrated to attribute.
|
||||
def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
|
||||
"DynamicVGPR",
|
||||
"true",
|
||||
"Enable dynamic VGPR mode"
|
||||
>;
|
||||
|
||||
// FIXME: Remove after all users are migrated to attribute.
|
||||
def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
|
||||
"DynamicVGPRBlockSize32",
|
||||
"true",
|
||||
|
@ -452,15 +452,17 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
|
||||
unsigned MaxWaves = MFI.getMaxWavesPerEU();
|
||||
uint64_t TotalNumVgpr =
|
||||
getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
|
||||
uint64_t NumVGPRsForWavesPerEU = std::max(
|
||||
{TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
|
||||
uint64_t NumVGPRsForWavesPerEU =
|
||||
std::max({TotalNumVgpr, (uint64_t)1,
|
||||
(uint64_t)STM.getMinNumVGPRs(
|
||||
MaxWaves, MFI.getDynamicVGPRBlockSize())});
|
||||
uint64_t NumSGPRsForWavesPerEU = std::max(
|
||||
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
|
||||
const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
|
||||
STM.getOccupancyWithWorkGroupSizes(*MF).second,
|
||||
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
|
||||
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
|
||||
OutContext);
|
||||
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
|
||||
MFI.getDynamicVGPRBlockSize(), STM, OutContext);
|
||||
uint64_t Occupancy;
|
||||
|
||||
const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
|
||||
@ -1082,7 +1084,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
Ctx);
|
||||
ProgInfo.NumVGPRsForWavesPerEU =
|
||||
AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
|
||||
CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
|
||||
CreateExpr(STM.getMinNumVGPRs(
|
||||
MaxWaves, MFI->getDynamicVGPRBlockSize()))},
|
||||
Ctx);
|
||||
|
||||
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
|
||||
@ -1256,7 +1259,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
|
||||
ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
|
||||
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
|
||||
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
|
||||
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
|
||||
MFI->getDynamicVGPRBlockSize(), STM, Ctx);
|
||||
|
||||
const auto [MinWEU, MaxWEU] =
|
||||
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
|
||||
@ -1405,7 +1409,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
|
||||
// Helper function to add common PAL Metadata 3.0+
|
||||
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
|
||||
const SIProgramInfo &CurrentProgramInfo,
|
||||
CallingConv::ID CC, const GCNSubtarget &ST) {
|
||||
CallingConv::ID CC, const GCNSubtarget &ST,
|
||||
unsigned DynamicVGPRBlockSize) {
|
||||
if (ST.hasIEEEMode())
|
||||
MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
|
||||
|
||||
@ -1417,7 +1422,7 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
|
||||
(bool)CurrentProgramInfo.TrapHandlerEnable);
|
||||
MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
|
||||
|
||||
if (ST.isDynamicVGPREnabled())
|
||||
if (DynamicVGPRBlockSize != 0)
|
||||
MD->setComputeRegisters(".dynamic_vgpr_en", true);
|
||||
}
|
||||
|
||||
@ -1444,7 +1449,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
|
||||
// For targets that support dynamic VGPRs, set the number of saved dynamic
|
||||
// VGPRs (if any) in the PAL metadata.
|
||||
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
|
||||
if (STM.isDynamicVGPREnabled() &&
|
||||
if (MFI->isDynamicVGPREnabled() &&
|
||||
MFI->getScratchReservedForDynamicVGPRs() > 0)
|
||||
MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
|
||||
MFI->getScratchReservedForDynamicVGPRs() / 4);
|
||||
@ -1470,7 +1475,8 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
|
||||
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
|
||||
MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
|
||||
CurrentProgramInfo.ScratchEnable);
|
||||
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
|
||||
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
|
||||
MFI->getDynamicVGPRBlockSize());
|
||||
}
|
||||
|
||||
// ScratchSize is in bytes, 16 aligned.
|
||||
@ -1541,7 +1547,9 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
|
||||
MD->setRsrc2(CallingConv::AMDGPU_CS,
|
||||
CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
|
||||
} else {
|
||||
EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
|
||||
EmitPALMetadataCommon(
|
||||
MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
|
||||
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
|
||||
}
|
||||
|
||||
// Set optional info
|
||||
|
@ -173,8 +173,16 @@ static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
|
||||
return 128;
|
||||
|
||||
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
|
||||
|
||||
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
|
||||
// Temporarily check both the attribute and the subtarget feature, until the
|
||||
// latter is removed.
|
||||
if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
|
||||
DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
|
||||
|
||||
unsigned MaxVGPRs = ST.getMaxNumVGPRs(
|
||||
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first);
|
||||
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
|
||||
DynamicVGPRBlockSize);
|
||||
|
||||
// A non-entry function has only 32 caller preserved registers.
|
||||
// Do not promote alloca which will force spilling unless we know the function
|
||||
|
@ -448,7 +448,10 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
|
||||
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
|
||||
// TODO: assert Regions are sorted descending by pressure
|
||||
const auto &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
|
||||
const unsigned DynamicVGPRBlockSize =
|
||||
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
|
||||
const auto Occ =
|
||||
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
|
||||
LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
|
||||
<< ", current = " << Occ << '\n');
|
||||
|
||||
@ -457,7 +460,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
|
||||
// Always build the DAG to add mutations
|
||||
BuildDAG DAG(*R, *this);
|
||||
|
||||
if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
|
||||
if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc)
|
||||
continue;
|
||||
|
||||
LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
|
||||
@ -468,7 +471,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
|
||||
LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
|
||||
printSchedRP(dbgs(), R->MaxPressure, MaxRP));
|
||||
|
||||
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
|
||||
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize));
|
||||
if (NewOcc <= Occ)
|
||||
break;
|
||||
|
||||
@ -489,9 +492,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
|
||||
const auto &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
auto TgtOcc = MFI->getMinAllowedOccupancy();
|
||||
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
|
||||
|
||||
sortRegionsByPressure(TgtOcc);
|
||||
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
|
||||
auto Occ =
|
||||
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
|
||||
|
||||
bool IsReentry = false;
|
||||
if (TryMaximizeOccupancy && Occ < TgtOcc) {
|
||||
@ -522,19 +527,21 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
|
||||
const auto RP = getRegionPressure(*R);
|
||||
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
|
||||
|
||||
if (RP.getOccupancy(ST) < TgtOcc) {
|
||||
if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
|
||||
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
|
||||
if (R->BestSchedule.get() &&
|
||||
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
|
||||
if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
|
||||
ST, DynamicVGPRBlockSize) >= TgtOcc) {
|
||||
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
|
||||
scheduleBest(*R);
|
||||
} else {
|
||||
LLVM_DEBUG(dbgs() << ", restoring\n");
|
||||
Ovr.restoreOrder();
|
||||
assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
|
||||
assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >=
|
||||
TgtOcc);
|
||||
}
|
||||
}
|
||||
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
|
||||
FinalOccupancy =
|
||||
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
|
||||
}
|
||||
}
|
||||
MFI->limitOccupancy(FinalOccupancy);
|
||||
@ -580,9 +587,11 @@ void GCNIterativeScheduler::scheduleILP(
|
||||
const auto &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
auto TgtOcc = MFI->getMinAllowedOccupancy();
|
||||
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
|
||||
|
||||
sortRegionsByPressure(TgtOcc);
|
||||
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
|
||||
auto Occ =
|
||||
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
|
||||
|
||||
bool IsReentry = false;
|
||||
if (TryMaximizeOccupancy && Occ < TgtOcc) {
|
||||
@ -603,17 +612,18 @@ void GCNIterativeScheduler::scheduleILP(
|
||||
const auto RP = getSchedulePressure(*R, ILPSchedule);
|
||||
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
|
||||
|
||||
if (RP.getOccupancy(ST) < TgtOcc) {
|
||||
if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
|
||||
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
|
||||
if (R->BestSchedule.get() &&
|
||||
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
|
||||
if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
|
||||
ST, DynamicVGPRBlockSize) >= TgtOcc) {
|
||||
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
|
||||
scheduleBest(*R);
|
||||
}
|
||||
} else {
|
||||
scheduleRegion(*R, ILPSchedule, RP);
|
||||
LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
|
||||
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
|
||||
FinalOccupancy =
|
||||
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
|
||||
}
|
||||
}
|
||||
MFI->limitOccupancy(FinalOccupancy);
|
||||
|
@ -251,7 +251,9 @@ bool GCNNSAReassignImpl::run(MachineFunction &MF) {
|
||||
|
||||
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
|
||||
MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
|
||||
MaxNumVGPRs = std::min(
|
||||
ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->getDynamicVGPRBlockSize()),
|
||||
MaxNumVGPRs);
|
||||
CSRegs = MRI->getCalleeSavedRegs();
|
||||
|
||||
using Candidate = std::pair<const MachineInstr*, bool>;
|
||||
|
@ -13,6 +13,7 @@
|
||||
|
||||
#include "GCNRegPressure.h"
|
||||
#include "AMDGPU.h"
|
||||
#include "SIMachineFunctionInfo.h"
|
||||
#include "llvm/CodeGen/RegisterPressure.h"
|
||||
|
||||
using namespace llvm;
|
||||
@ -94,17 +95,20 @@ void GCNRegPressure::inc(unsigned Reg,
|
||||
bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
|
||||
unsigned MaxOccupancy) const {
|
||||
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
unsigned DynamicVGPRBlockSize =
|
||||
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
|
||||
|
||||
const auto SGPROcc = std::min(MaxOccupancy,
|
||||
ST.getOccupancyWithNumSGPRs(getSGPRNum()));
|
||||
const auto VGPROcc =
|
||||
std::min(MaxOccupancy,
|
||||
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
|
||||
const auto VGPROcc = std::min(
|
||||
MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
|
||||
DynamicVGPRBlockSize));
|
||||
const auto OtherSGPROcc = std::min(MaxOccupancy,
|
||||
ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
|
||||
const auto OtherVGPROcc =
|
||||
std::min(MaxOccupancy,
|
||||
ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts())));
|
||||
std::min(MaxOccupancy,
|
||||
ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()),
|
||||
DynamicVGPRBlockSize));
|
||||
|
||||
const auto Occ = std::min(SGPROcc, VGPROcc);
|
||||
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
|
||||
@ -226,13 +230,15 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
|
||||
O.getVGPRNum(ST.hasGFX90AInsts()));
|
||||
}
|
||||
|
||||
Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
|
||||
return Printable([&RP, ST](raw_ostream &OS) {
|
||||
Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
|
||||
unsigned DynamicVGPRBlockSize) {
|
||||
return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) {
|
||||
OS << "VGPRs: " << RP.getArchVGPRNum() << ' '
|
||||
<< "AGPRs: " << RP.getAGPRNum();
|
||||
if (ST)
|
||||
OS << "(O"
|
||||
<< ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()))
|
||||
<< ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()),
|
||||
DynamicVGPRBlockSize)
|
||||
<< ')';
|
||||
OS << ", SGPRs: " << RP.getSGPRNum();
|
||||
if (ST)
|
||||
@ -240,7 +246,7 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
|
||||
OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight()
|
||||
<< ", LSGPR WT: " << RP.getSGPRTuplesWeight();
|
||||
if (ST)
|
||||
OS << " -> Occ: " << RP.getOccupancy(*ST);
|
||||
OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize);
|
||||
OS << '\n';
|
||||
});
|
||||
}
|
||||
|
@ -69,9 +69,11 @@ struct GCNRegPressure {
|
||||
}
|
||||
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
|
||||
|
||||
unsigned getOccupancy(const GCNSubtarget &ST) const {
|
||||
unsigned getOccupancy(const GCNSubtarget &ST,
|
||||
unsigned DynamicVGPRBlockSize) const {
|
||||
return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
|
||||
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
|
||||
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
|
||||
DynamicVGPRBlockSize));
|
||||
}
|
||||
|
||||
void inc(unsigned Reg,
|
||||
@ -79,8 +81,10 @@ struct GCNRegPressure {
|
||||
LaneBitmask NewMask,
|
||||
const MachineRegisterInfo &MRI);
|
||||
|
||||
bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const {
|
||||
return getOccupancy(ST) > O.getOccupancy(ST);
|
||||
bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O,
|
||||
unsigned DynamicVGPRBlockSize) const {
|
||||
return getOccupancy(ST, DynamicVGPRBlockSize) >
|
||||
O.getOccupancy(ST, DynamicVGPRBlockSize);
|
||||
}
|
||||
|
||||
/// Compares \p this GCNRegpressure to \p O, returning true if \p this is
|
||||
@ -133,7 +137,8 @@ private:
|
||||
friend GCNRegPressure max(const GCNRegPressure &P1,
|
||||
const GCNRegPressure &P2);
|
||||
|
||||
friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST);
|
||||
friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST,
|
||||
unsigned DynamicVGPRBlockSize);
|
||||
};
|
||||
|
||||
inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
|
||||
@ -402,7 +407,8 @@ GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
|
||||
bool isEqual(const GCNRPTracker::LiveRegSet &S1,
|
||||
const GCNRPTracker::LiveRegSet &S2);
|
||||
|
||||
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr);
|
||||
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr,
|
||||
unsigned DynamicVGPRBlockSize = 0);
|
||||
|
||||
Printable print(const GCNRPTracker::LiveRegSet &LiveRegs,
|
||||
const MachineRegisterInfo &MRI);
|
||||
|
@ -99,17 +99,20 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
|
||||
std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
|
||||
|
||||
if (!KnownExcessRP) {
|
||||
VGPRCriticalLimit =
|
||||
std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
|
||||
VGPRCriticalLimit = std::min(
|
||||
ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
|
||||
VGPRExcessLimit);
|
||||
} else {
|
||||
// This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
|
||||
// returns a reasonably small number for targets with lots of VGPRs, such
|
||||
// as GFX10 and GFX11.
|
||||
LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
|
||||
"VGPRCriticalLimit calculation method.\n");
|
||||
|
||||
unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
|
||||
unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
|
||||
unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
|
||||
unsigned Granule =
|
||||
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
|
||||
unsigned Addressable =
|
||||
AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
|
||||
unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
|
||||
VGPRBudget = std::max(VGPRBudget, Granule);
|
||||
VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
|
||||
@ -1136,7 +1139,8 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
|
||||
if (DAG.MinOccupancy > InitialOccupancy) {
|
||||
for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
|
||||
DAG.RegionsWithMinOcc[IDX] =
|
||||
DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
|
||||
DAG.Pressure[IDX].getOccupancy(
|
||||
DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy;
|
||||
|
||||
LLVM_DEBUG(dbgs() << StageID
|
||||
<< " stage successfully increased occupancy to "
|
||||
@ -1273,11 +1277,14 @@ void GCNSchedStage::checkScheduling() {
|
||||
LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
|
||||
LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
|
||||
|
||||
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
|
||||
|
||||
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
|
||||
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
|
||||
DAG.Pressure[RegionIdx] = PressureAfter;
|
||||
DAG.RegionsWithMinOcc[RegionIdx] =
|
||||
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
|
||||
PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
|
||||
DAG.MinOccupancy;
|
||||
|
||||
// Early out if we have achieved the occupancy target.
|
||||
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
|
||||
@ -1286,10 +1293,10 @@ void GCNSchedStage::checkScheduling() {
|
||||
|
||||
unsigned TargetOccupancy = std::min(
|
||||
S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
|
||||
unsigned WavesAfter =
|
||||
std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
|
||||
unsigned WavesBefore =
|
||||
std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
|
||||
unsigned WavesAfter = std::min(
|
||||
TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
|
||||
unsigned WavesBefore = std::min(
|
||||
TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
|
||||
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
|
||||
<< ", after " << WavesAfter << ".\n");
|
||||
|
||||
@ -1338,7 +1345,8 @@ void GCNSchedStage::checkScheduling() {
|
||||
} else {
|
||||
DAG.Pressure[RegionIdx] = PressureAfter;
|
||||
DAG.RegionsWithMinOcc[RegionIdx] =
|
||||
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
|
||||
PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
|
||||
DAG.MinOccupancy;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1461,11 +1469,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
|
||||
return true;
|
||||
|
||||
// For dynamic VGPR mode, we don't want to waste any VGPR blocks.
|
||||
if (ST.isDynamicVGPREnabled()) {
|
||||
if (DAG.MFI.isDynamicVGPREnabled()) {
|
||||
unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
|
||||
&ST, PressureBefore.getVGPRNum(false));
|
||||
&ST, DAG.MFI.getDynamicVGPRBlockSize(),
|
||||
PressureBefore.getVGPRNum(false));
|
||||
unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
|
||||
&ST, PressureAfter.getVGPRNum(false));
|
||||
&ST, DAG.MFI.getDynamicVGPRBlockSize(),
|
||||
PressureAfter.getVGPRNum(false));
|
||||
if (BlocksAfter > BlocksBefore)
|
||||
return true;
|
||||
}
|
||||
@ -1489,7 +1499,8 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
|
||||
bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
|
||||
// If RP is not reduced in the unclustered reschedule stage, revert to the
|
||||
// old schedule.
|
||||
if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
|
||||
if ((WavesAfter <=
|
||||
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
|
||||
mayCauseSpilling(WavesAfter)) ||
|
||||
GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
|
||||
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
|
||||
@ -1511,8 +1522,9 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
|
||||
ScheduleMetrics MAfter = getScheduleMetrics(DAG);
|
||||
unsigned OldMetric = MBefore.getMetric();
|
||||
unsigned NewMetric = MAfter.getMetric();
|
||||
unsigned WavesBefore =
|
||||
std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
|
||||
unsigned WavesBefore = std::min(
|
||||
S.getTargetOccupancy(),
|
||||
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
|
||||
unsigned Profit =
|
||||
((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
|
||||
((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
|
||||
@ -1566,7 +1578,8 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
|
||||
|
||||
void GCNSchedStage::revertScheduling() {
|
||||
DAG.RegionsWithMinOcc[RegionIdx] =
|
||||
PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
|
||||
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) ==
|
||||
DAG.MinOccupancy;
|
||||
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
|
||||
DAG.RegionEnd = DAG.RegionBegin;
|
||||
int SkippedDebugInstr = 0;
|
||||
@ -1844,13 +1857,16 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
|
||||
// occupancy, or regions with VGPR spilling) to a model of their excess RP.
|
||||
DenseMap<unsigned, ExcessRP> OptRegions;
|
||||
const Function &F = MF.getFunction();
|
||||
unsigned DynamicVGPRBlockSize =
|
||||
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
|
||||
|
||||
std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
|
||||
const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
|
||||
const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
|
||||
const unsigned MaxSGPRsIncOcc =
|
||||
ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
|
||||
const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
|
||||
const unsigned MaxVGPRsIncOcc =
|
||||
ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
|
||||
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
|
||||
|
||||
auto ClearOptRegionsIf = [&](bool Cond) -> bool {
|
||||
@ -2163,7 +2179,9 @@ void PreRARematStage::rematerialize() {
|
||||
}
|
||||
}
|
||||
DAG.Pressure[I] = RP;
|
||||
AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(ST));
|
||||
AchievedOcc = std::min(
|
||||
AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
|
||||
->getDynamicVGPRBlockSize()));
|
||||
}
|
||||
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
|
||||
}
|
||||
|
@ -366,8 +366,11 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
|
||||
getGeneration());
|
||||
}
|
||||
|
||||
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
|
||||
return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
|
||||
unsigned
|
||||
GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs,
|
||||
unsigned DynamicVGPRBlockSize) const {
|
||||
return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs,
|
||||
DynamicVGPRBlockSize);
|
||||
}
|
||||
|
||||
unsigned
|
||||
@ -403,9 +406,15 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
|
||||
std::pair<unsigned, unsigned>
|
||||
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
|
||||
unsigned NumSGPRs, unsigned NumVGPRs) const {
|
||||
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
|
||||
// Temporarily check both the attribute and the subtarget feature until the
|
||||
// latter is removed.
|
||||
if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
|
||||
DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
|
||||
|
||||
auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
|
||||
unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
|
||||
unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);
|
||||
unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
|
||||
|
||||
// Maximum occupancy may be further limited by high SGPR/VGPR usage.
|
||||
MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
|
||||
@ -512,9 +521,16 @@ unsigned GCNSubtarget::getBaseMaxNumVGPRs(
|
||||
}
|
||||
|
||||
unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
|
||||
// Temporarily check both the attribute and the subtarget feature, until the
|
||||
// latter is removed.
|
||||
unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
|
||||
if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
|
||||
DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
|
||||
|
||||
std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
|
||||
return getBaseMaxNumVGPRs(
|
||||
F, {getMinNumVGPRs(Waves.second), getMaxNumVGPRs(Waves.first)});
|
||||
F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
|
||||
getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
|
||||
}
|
||||
|
||||
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
|
||||
|
@ -1391,7 +1391,8 @@ public:
|
||||
|
||||
/// Return the maximum number of waves per SIMD for kernels using \p VGPRs
|
||||
/// VGPRs
|
||||
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
|
||||
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
|
||||
unsigned DynamicVGPRBlockSize) const;
|
||||
|
||||
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
|
||||
/// be achieved when the only function running on a CU is \p F, each workgroup
|
||||
@ -1549,8 +1550,8 @@ public:
|
||||
unsigned getMaxNumSGPRs(const Function &F) const;
|
||||
|
||||
/// \returns VGPR allocation granularity supported by the subtarget.
|
||||
unsigned getVGPRAllocGranule() const {
|
||||
return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
|
||||
unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
|
||||
return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
|
||||
}
|
||||
|
||||
/// \returns VGPR encoding granularity supported by the subtarget.
|
||||
@ -1570,20 +1571,24 @@ public:
|
||||
}
|
||||
|
||||
/// \returns Addressable number of VGPRs supported by the subtarget.
|
||||
unsigned getAddressableNumVGPRs() const {
|
||||
return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
|
||||
unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
|
||||
return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
|
||||
}
|
||||
|
||||
/// \returns the minimum number of VGPRs that will prevent achieving more than
|
||||
/// the specified number of waves \p WavesPerEU.
|
||||
unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
|
||||
return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
|
||||
unsigned getMinNumVGPRs(unsigned WavesPerEU,
|
||||
unsigned DynamicVGPRBlockSize) const {
|
||||
return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
|
||||
DynamicVGPRBlockSize);
|
||||
}
|
||||
|
||||
/// \returns the maximum number of VGPRs that can be used and still achieve
|
||||
/// at least the specified number of waves \p WavesPerEU.
|
||||
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
|
||||
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
|
||||
unsigned getMaxNumVGPRs(unsigned WavesPerEU,
|
||||
unsigned DynamicVGPRBlockSize) const {
|
||||
return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
|
||||
DynamicVGPRBlockSize);
|
||||
}
|
||||
|
||||
/// \returns max num VGPRs. This is the common utility function
|
||||
@ -1686,6 +1691,9 @@ public:
|
||||
}
|
||||
|
||||
bool isDynamicVGPREnabled() const { return DynamicVGPR; }
|
||||
unsigned getDynamicVGPRBlockSize() const {
|
||||
return DynamicVGPRBlockSize32 ? 32 : 16;
|
||||
}
|
||||
|
||||
bool requiresDisjointEarlyClobberAndUndef() const override {
|
||||
// AMDGPU doesn't care if early-clobber and undef operands are allocated
|
||||
|
@ -313,13 +313,11 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
|
||||
/// Remove dependency on GCNSubtarget and depend only on the necessary values
|
||||
/// for said occupancy computation. Should match computeOccupancy implementation
|
||||
/// without passing \p STM on.
|
||||
const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(unsigned InitOcc,
|
||||
const MCExpr *NumSGPRs,
|
||||
const MCExpr *NumVGPRs,
|
||||
const GCNSubtarget &STM,
|
||||
MCContext &Ctx) {
|
||||
const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(
|
||||
unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs,
|
||||
unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) {
|
||||
unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
|
||||
unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM);
|
||||
unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
|
||||
unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
|
||||
unsigned Generation = STM.getGeneration();
|
||||
|
||||
|
@ -93,11 +93,10 @@ public:
|
||||
return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
|
||||
}
|
||||
|
||||
static const AMDGPUMCExpr *createOccupancy(unsigned InitOcc,
|
||||
const MCExpr *NumSGPRs,
|
||||
const MCExpr *NumVGPRs,
|
||||
const GCNSubtarget &STM,
|
||||
MCContext &Ctx);
|
||||
static const AMDGPUMCExpr *
|
||||
createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
|
||||
const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize,
|
||||
const GCNSubtarget &STM, MCContext &Ctx);
|
||||
|
||||
ArrayRef<const MCExpr *> getArgs() const { return Args; }
|
||||
VariantKind getKind() const { return Kind; }
|
||||
|
@ -197,7 +197,9 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
|
||||
// pointer becomes dead and could otherwise be reused for destination.
|
||||
RPT.advanceToNext();
|
||||
GCNRegPressure MaxPressure = RPT.moveMaxPressure();
|
||||
unsigned Occupancy = MaxPressure.getOccupancy(*ST);
|
||||
unsigned Occupancy = MaxPressure.getOccupancy(
|
||||
*ST,
|
||||
MI.getMF()->getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
|
||||
|
||||
// Don't push over half the register budget. We don't want to introduce
|
||||
// spilling just to form a soft clause.
|
||||
|
@ -714,11 +714,12 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
|
||||
assert(hasFP(MF));
|
||||
Register FPReg = MFI->getFrameOffsetReg();
|
||||
assert(FPReg != AMDGPU::FP_REG);
|
||||
unsigned VGPRSize =
|
||||
llvm::alignTo((ST.getAddressableNumVGPRs() -
|
||||
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) *
|
||||
4,
|
||||
FrameInfo.getMaxAlign());
|
||||
unsigned VGPRSize = llvm::alignTo(
|
||||
(ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()) -
|
||||
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST,
|
||||
MFI->getDynamicVGPRBlockSize())) *
|
||||
4,
|
||||
FrameInfo.getMaxAlign());
|
||||
MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
|
||||
|
||||
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
|
||||
@ -2087,7 +2088,7 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
|
||||
|
||||
bool SIFrameLowering::mayReserveScratchForCWSR(
|
||||
const MachineFunction &MF) const {
|
||||
return MF.getSubtarget<GCNSubtarget>().isDynamicVGPREnabled() &&
|
||||
return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
|
||||
AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) &&
|
||||
AMDGPU::isCompute(MF.getFunction().getCallingConv());
|
||||
}
|
||||
|
@ -1760,7 +1760,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
|
||||
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
|
||||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
|
||||
if (!WCG->isOptNone() &&
|
||||
(ST->isDynamicVGPREnabled() ||
|
||||
(MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
|
||||
(ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
|
||||
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
|
||||
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
|
||||
@ -2652,7 +2652,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
|
||||
Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
|
||||
Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
|
||||
|
||||
[[maybe_unused]] unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
|
||||
[[maybe_unused]] unsigned NumVGPRsMax =
|
||||
ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
|
||||
[[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
|
||||
assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
|
||||
assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
|
||||
@ -2821,7 +2822,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
|
||||
// (i.e. whether we're in dynamic VGPR mode or not).
|
||||
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
|
||||
// waveslot limited kernel runs slower with the deallocation.
|
||||
if (ST->isDynamicVGPREnabled()) {
|
||||
if (MFI->isDynamicVGPREnabled()) {
|
||||
for (MachineInstr *MI : ReleaseVGPRInsts) {
|
||||
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
|
||||
TII->get(AMDGPU::S_ALLOC_VGPR))
|
||||
@ -2832,7 +2833,8 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
|
||||
if (!ReleaseVGPRInsts.empty() &&
|
||||
(MF.getFrameInfo().hasCalls() ||
|
||||
ST->getOccupancyWithNumVGPRs(
|
||||
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
|
||||
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
|
||||
/*IsDynamicVGPR=*/false) <
|
||||
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
|
||||
for (MachineInstr *MI : ReleaseVGPRInsts) {
|
||||
if (ST->requiresNopBeforeDeallocVGPRs()) {
|
||||
|
@ -48,6 +48,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
|
||||
MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
|
||||
assert(MaxNumWorkGroups.size() == 3);
|
||||
|
||||
// Temporarily check both the attribute and the subtarget feature, until the
|
||||
// latter is completely removed.
|
||||
DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
|
||||
if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
|
||||
DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
|
||||
|
||||
Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
|
||||
CallingConv::ID CC = F.getCallingConv();
|
||||
|
||||
@ -716,6 +722,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
|
||||
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
|
||||
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
|
||||
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
|
||||
DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
|
||||
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
|
||||
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
|
||||
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
|
||||
|
@ -299,6 +299,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
|
||||
|
||||
bool HasInitWholeWave = false;
|
||||
|
||||
unsigned DynamicVGPRBlockSize = 0;
|
||||
unsigned ScratchReservedForDynamicVGPRs = 0;
|
||||
|
||||
SIMachineFunctionInfo() = default;
|
||||
@ -352,6 +353,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
|
||||
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
|
||||
StringValue());
|
||||
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
|
||||
YamlIO.mapOptional("dynamicVGPRBlockSize", MFI.DynamicVGPRBlockSize, false);
|
||||
YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
|
||||
MFI.ScratchReservedForDynamicVGPRs, 0);
|
||||
}
|
||||
@ -469,6 +471,8 @@ private:
|
||||
unsigned NumSpilledSGPRs = 0;
|
||||
unsigned NumSpilledVGPRs = 0;
|
||||
|
||||
unsigned DynamicVGPRBlockSize = 0;
|
||||
|
||||
// The size in bytes of the scratch space reserved for the CWSR trap handler
|
||||
// to spill some of the dynamic VGPRs.
|
||||
unsigned ScratchReservedForDynamicVGPRs = 0;
|
||||
@ -820,6 +824,9 @@ public:
|
||||
BytesInStackArgArea = Bytes;
|
||||
}
|
||||
|
||||
bool isDynamicVGPREnabled() const { return DynamicVGPRBlockSize != 0; }
|
||||
unsigned getDynamicVGPRBlockSize() const { return DynamicVGPRBlockSize; }
|
||||
|
||||
// This is only used if we need to save any dynamic VGPRs in scratch.
|
||||
unsigned getScratchReservedForDynamicVGPRs() const {
|
||||
return ScratchReservedForDynamicVGPRs;
|
||||
|
@ -3748,7 +3748,11 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
|
||||
default:
|
||||
return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
|
||||
case AMDGPU::VGPR_32RegClassID:
|
||||
return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF));
|
||||
return std::min(
|
||||
ST.getMaxNumVGPRs(
|
||||
MinOcc,
|
||||
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize()),
|
||||
ST.getMaxNumVGPRs(MF));
|
||||
case AMDGPU::SGPR_32RegClassID:
|
||||
case AMDGPU::SGPR_LO16RegClassID:
|
||||
return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
|
||||
|
@ -1158,10 +1158,16 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
|
||||
}
|
||||
|
||||
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
|
||||
unsigned DynamicVGPRBlockSize,
|
||||
std::optional<bool> EnableWavefrontSize32) {
|
||||
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
|
||||
return 8;
|
||||
|
||||
if (DynamicVGPRBlockSize != 0)
|
||||
return DynamicVGPRBlockSize;
|
||||
|
||||
// Temporarily check the subtarget feature, until we fully switch to using
|
||||
// attributes.
|
||||
if (STI->getFeatureBits().test(FeatureDynamicVGPR))
|
||||
return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
|
||||
|
||||
@ -1205,20 +1211,26 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
|
||||
|
||||
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
|
||||
|
||||
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
|
||||
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
|
||||
unsigned DynamicVGPRBlockSize) {
|
||||
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
|
||||
return 512;
|
||||
if (STI->getFeatureBits().test(FeatureDynamicVGPR))
|
||||
|
||||
// Temporarily check the subtarget feature, until we fully switch to using
|
||||
// attributes.
|
||||
if (DynamicVGPRBlockSize != 0 ||
|
||||
STI->getFeatureBits().test(FeatureDynamicVGPR))
|
||||
// On GFX12 we can allocate at most 8 blocks of VGPRs.
|
||||
return 8 * getVGPRAllocGranule(STI);
|
||||
return 8 * getVGPRAllocGranule(STI, DynamicVGPRBlockSize);
|
||||
return getAddressableNumArchVGPRs(STI);
|
||||
}
|
||||
|
||||
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
|
||||
unsigned NumVGPRs) {
|
||||
return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI),
|
||||
getMaxWavesPerEU(STI),
|
||||
getTotalNumVGPRs(STI));
|
||||
unsigned NumVGPRs,
|
||||
unsigned DynamicVGPRBlockSize) {
|
||||
return getNumWavesPerEUWithNumVGPRs(
|
||||
NumVGPRs, getVGPRAllocGranule(STI, DynamicVGPRBlockSize),
|
||||
getMaxWavesPerEU(STI), getTotalNumVGPRs(STI));
|
||||
}
|
||||
|
||||
unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
|
||||
@ -1257,7 +1269,8 @@ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
|
||||
return 5;
|
||||
}
|
||||
|
||||
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
|
||||
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
|
||||
unsigned DynamicVGPRBlockSize) {
|
||||
assert(WavesPerEU != 0);
|
||||
|
||||
unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
|
||||
@ -1265,28 +1278,33 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
|
||||
return 0;
|
||||
|
||||
unsigned TotNumVGPRs = getTotalNumVGPRs(STI);
|
||||
unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI);
|
||||
unsigned Granule = getVGPRAllocGranule(STI);
|
||||
unsigned AddrsableNumVGPRs =
|
||||
getAddressableNumVGPRs(STI, DynamicVGPRBlockSize);
|
||||
unsigned Granule = getVGPRAllocGranule(STI, DynamicVGPRBlockSize);
|
||||
unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule);
|
||||
|
||||
if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule))
|
||||
return 0;
|
||||
|
||||
unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs);
|
||||
unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs,
|
||||
DynamicVGPRBlockSize);
|
||||
if (WavesPerEU < MinWavesPerEU)
|
||||
return getMinNumVGPRs(STI, MinWavesPerEU);
|
||||
return getMinNumVGPRs(STI, MinWavesPerEU, DynamicVGPRBlockSize);
|
||||
|
||||
unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule);
|
||||
unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
|
||||
return std::min(MinNumVGPRs, AddrsableNumVGPRs);
|
||||
}
|
||||
|
||||
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
|
||||
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
|
||||
unsigned DynamicVGPRBlockSize) {
|
||||
assert(WavesPerEU != 0);
|
||||
|
||||
unsigned MaxNumVGPRs =
|
||||
alignDown(getTotalNumVGPRs(STI) / WavesPerEU, getVGPRAllocGranule(STI));
|
||||
unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI);
|
||||
alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
|
||||
getVGPRAllocGranule(STI, DynamicVGPRBlockSize));
|
||||
unsigned AddressableNumVGPRs =
|
||||
getAddressableNumVGPRs(STI, DynamicVGPRBlockSize);
|
||||
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
|
||||
}
|
||||
|
||||
@ -1299,9 +1317,11 @@ unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
|
||||
|
||||
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI,
|
||||
unsigned NumVGPRs,
|
||||
unsigned DynamicVGPRBlockSize,
|
||||
std::optional<bool> EnableWavefrontSize32) {
|
||||
return getGranulatedNumRegisterBlocks(
|
||||
NumVGPRs, getVGPRAllocGranule(STI, EnableWavefrontSize32));
|
||||
NumVGPRs,
|
||||
getVGPRAllocGranule(STI, DynamicVGPRBlockSize, EnableWavefrontSize32));
|
||||
}
|
||||
} // end namespace IsaInfo
|
||||
|
||||
@ -2124,6 +2144,16 @@ bool getHasDepthExport(const Function &F) {
|
||||
return F.getFnAttributeAsParsedInteger("amdgpu-depth-export", 0) != 0;
|
||||
}
|
||||
|
||||
unsigned getDynamicVGPRBlockSize(const Function &F) {
|
||||
unsigned BlockSize =
|
||||
F.getFnAttributeAsParsedInteger("amdgpu-dynamic-vgpr-block-size", 0);
|
||||
|
||||
if (BlockSize == 16 || BlockSize == 32)
|
||||
return BlockSize;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool hasXNACK(const MCSubtargetInfo &STI) {
|
||||
return STI.hasFeature(AMDGPU::FeatureXNACK);
|
||||
}
|
||||
|
@ -298,7 +298,7 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
|
||||
/// For subtargets which support it, \p EnableWavefrontSize32 should match
|
||||
/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
|
||||
unsigned
|
||||
getVGPRAllocGranule(const MCSubtargetInfo *STI,
|
||||
getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize,
|
||||
std::optional<bool> EnableWavefrontSize32 = std::nullopt);
|
||||
|
||||
/// \returns VGPR encoding granularity for given subtarget \p STI.
|
||||
@ -321,20 +321,24 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
|
||||
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI);
|
||||
|
||||
/// \returns Addressable number of VGPRs for given subtarget \p STI.
|
||||
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI);
|
||||
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI,
|
||||
unsigned DynamicVGPRBlockSize);
|
||||
|
||||
/// \returns Minimum number of VGPRs that meets given number of waves per
|
||||
/// execution unit requirement for given subtarget \p STI.
|
||||
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
|
||||
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
|
||||
unsigned DynamicVGPRBlockSize);
|
||||
|
||||
/// \returns Maximum number of VGPRs that meets given number of waves per
|
||||
/// execution unit requirement for given subtarget \p STI.
|
||||
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
|
||||
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
|
||||
unsigned DynamicVGPRBlockSize);
|
||||
|
||||
/// \returns Number of waves reachable for a given \p NumVGPRs usage for given
|
||||
/// subtarget \p STI.
|
||||
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
|
||||
unsigned NumVGPRs);
|
||||
unsigned NumVGPRs,
|
||||
unsigned DynamicVGPRBlockSize);
|
||||
|
||||
/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
|
||||
/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
|
||||
@ -361,6 +365,7 @@ unsigned getEncodedNumVGPRBlocks(
|
||||
/// subtarget \p STI when \p NumVGPRs are used.
|
||||
unsigned getAllocatedNumVGPRBlocks(
|
||||
const MCSubtargetInfo *STI, unsigned NumVGPRs,
|
||||
unsigned DynamicVGPRBlockSize,
|
||||
std::optional<bool> EnableWavefrontSize32 = std::nullopt);
|
||||
|
||||
} // end namespace IsaInfo
|
||||
@ -1305,6 +1310,12 @@ bool getHasColorExport(const Function &F);
|
||||
|
||||
bool getHasDepthExport(const Function &F);
|
||||
|
||||
bool hasDynamicVGPR(const Function &F);
|
||||
|
||||
// Returns the value of the "amdgpu-dynamic-vgpr-block-size" attribute, or 0 if
|
||||
// the attribute is missing or its value is invalid.
|
||||
unsigned getDynamicVGPRBlockSize(const Function &F);
|
||||
|
||||
LLVM_READNONE
|
||||
constexpr bool isShader(CallingConv::ID CC) {
|
||||
switch (CC) {
|
||||
|
@ -1,6 +1,6 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-TRUE16 %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefixes=CHECK,CHECK-FAKE16 %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=CHECK,CHECK-TRUE16 %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=CHECK,CHECK-FAKE16 %s
|
||||
|
||||
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
|
||||
|
||||
@ -137,7 +137,7 @@ define amdgpu_cs void @with_calls_no_inline_const() #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_cs void @with_spills() {
|
||||
define amdgpu_cs void @with_spills() #0 {
|
||||
; CHECK-LABEL: with_spills:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
|
||||
@ -366,6 +366,6 @@ define void @default() #0 {
|
||||
|
||||
declare amdgpu_gfx void @callee(i32) #0
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind "frame-pointer"="none" }
|
||||
attributes #2 = { nounwind "frame-pointer"="all" }
|
||||
attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
|
||||
attributes #1 = { nounwind "frame-pointer"="none" "amdgpu-dynamic-vgpr-block-size"="16" }
|
||||
attributes #2 = { nounwind "frame-pointer"="all" "amdgpu-dynamic-vgpr-block-size"="16" }
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
|
||||
|
||||
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
|
||||
|
||||
@ -68,5 +68,5 @@ define void @default() #0 {
|
||||
|
||||
declare amdgpu_gfx void @callee(i32) #0
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size" = "16" }
|
||||
|
||||
|
llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll (new normal file, 305 lines)
@ -0,0 +1,305 @@
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; CHECK: .amdgpu_pal_metadata
|
||||
; CHECK-NEXT: ---
|
||||
; CHECK-NEXT: amdpal.pipelines:
|
||||
; CHECK-NEXT: - .api: Vulkan
|
||||
; CHECK-NEXT: .compute_registers:
|
||||
; CHECK-NEXT: .dynamic_vgpr_en: true
|
||||
; CHECK-NEXT: .tg_size_en: true
|
||||
; CHECK-NEXT: .tgid_x_en: false
|
||||
; CHECK-NEXT: .tgid_y_en: false
|
||||
; CHECK-NEXT: .tgid_z_en: false
|
||||
; CHECK-NEXT: .tidig_comp_cnt: 0x1
|
||||
; CHECK-NEXT: .hardware_stages:
|
||||
; CHECK-NEXT: .cs:
|
||||
; CHECK-NEXT: .checksum_value: 0x9444d7d0
|
||||
; CHECK-NEXT: .debug_mode: 0
|
||||
; CHECK-NEXT: .excp_en: 0
|
||||
; CHECK-NEXT: .float_mode: 0xc0
|
||||
; CHECK-NEXT: .image_op: false
|
||||
; CHECK-NEXT: .lds_size: 0x200
|
||||
; CHECK-NEXT: .mem_ordered: true
|
||||
; CHECK-NEXT: .sgpr_limit: 0x6a
|
||||
; CHECK-NEXT: .threadgroup_dimensions:
|
||||
; CHECK-NEXT: - 0x1
|
||||
; CHECK-NEXT: - 0x400
|
||||
; CHECK-NEXT: - 0x1
|
||||
; CHECK-NEXT: .trap_present: false
|
||||
; CHECK-NEXT: .user_data_reg_map:
|
||||
; CHECK-NEXT: - 0x10000000
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: .user_sgprs: 0x3
|
||||
; CHECK-NEXT: .vgpr_limit: 0x100
|
||||
; CHECK-NEXT: .wavefront_size: 0x40
|
||||
; CHECK-NEXT: .wgp_mode: true
|
||||
; CHECK: .registers: {}
|
||||
; CHECK-NEXT: .shader_functions:
|
||||
; CHECK-NEXT: dynamic_stack:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x10
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x22
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
|
||||
; CHECK-NEXT: .vgpr_count: 0x2
|
||||
; CHECK-NEXT: dynamic_stack_loop:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x10
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x22
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
|
||||
; CHECK-NEXT: .vgpr_count: 0x3
|
||||
; CHECK-NEXT: multiple_stack:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x24
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x1
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24
|
||||
; CHECK-NEXT: .vgpr_count: 0x3
|
||||
; CHECK-NEXT: no_stack:
|
||||
; CHECK-NEXT: .backend_stack_size: 0
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x1
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
|
||||
; CHECK-NEXT: .vgpr_count: 0x1
|
||||
; CHECK-NEXT: no_stack_call:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x10
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x22
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
|
||||
; CHECK-NEXT: .vgpr_count: 0x3
|
||||
; CHECK-NEXT: no_stack_extern_call:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x10
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x24
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
|
||||
; CHECK-NEXT: .vgpr_count: 0x58
|
||||
; CHECK-NEXT: no_stack_extern_call_many_args:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x90
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x24
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x90
|
||||
; CHECK-NEXT: .vgpr_count: 0x58
|
||||
; CHECK-NEXT: no_stack_indirect_call:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x10
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x24
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
|
||||
; CHECK-NEXT: .vgpr_count: 0x58
|
||||
; CHECK-NEXT: simple_lds:
|
||||
; CHECK-NEXT: .backend_stack_size: 0
|
||||
; CHECK-NEXT: .lds_size: 0x100
|
||||
; CHECK-NEXT: .sgpr_count: 0x1
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0
|
||||
; CHECK-NEXT: .vgpr_count: 0x1
|
||||
; CHECK-NEXT: simple_lds_recurse:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x10
|
||||
; CHECK-NEXT: .lds_size: 0x100
|
||||
; CHECK-NEXT: .sgpr_count: 0x24
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
|
||||
; CHECK-NEXT: .vgpr_count: 0x29
|
||||
; CHECK-NEXT: simple_stack:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x14
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x1
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14
|
||||
; CHECK-NEXT: .vgpr_count: 0x2
|
||||
; CHECK-NEXT: simple_stack_call:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x20
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x22
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
|
||||
; CHECK-NEXT: .vgpr_count: 0x4
|
||||
; CHECK-NEXT: simple_stack_extern_call:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x20
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x24
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
|
||||
; CHECK-NEXT: .vgpr_count: 0x58
|
||||
; CHECK-NEXT: simple_stack_indirect_call:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x20
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x24
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
|
||||
; CHECK-NEXT: .vgpr_count: 0x58
|
||||
; CHECK-NEXT: simple_stack_recurse:
|
||||
; CHECK-NEXT: .backend_stack_size: 0x20
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x24
|
||||
; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
|
||||
; CHECK-NEXT: .vgpr_count: 0x2a
|
||||
; CHECK:amdpal.version:
|
||||
; CHECK-NEXT: - 0x3
|
||||
; CHECK-NEXT: - 0
|
||||
; CHECK-NEXT:...
|
||||
; CHECK-NEXT: .end_amdgpu_pal_metadata
|
||||
|
||||
declare amdgpu_gfx float @extern_func(float) #0
|
||||
declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
|
||||
|
||||
@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4
|
||||
|
||||
define amdgpu_gfx float @no_stack(float %arg0) #0 {
|
||||
%add = fadd float %arg0, 1.0
|
||||
ret float %add
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @simple_stack(float %arg0) #0 {
|
||||
%stack = alloca float, i32 4, align 4, addrspace(5)
|
||||
store volatile float 2.0, ptr addrspace(5) %stack
|
||||
%val = load volatile float, ptr addrspace(5) %stack
|
||||
%add = fadd float %arg0, %val
|
||||
ret float %add
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
|
||||
%stack = alloca float, i32 4, align 4, addrspace(5)
|
||||
store volatile float 2.0, ptr addrspace(5) %stack
|
||||
%val = load volatile float, ptr addrspace(5) %stack
|
||||
%add = fadd float %arg0, %val
|
||||
%stack2 = alloca float, i32 4, align 4, addrspace(5)
|
||||
store volatile float 2.0, ptr addrspace(5) %stack2
|
||||
%val2 = load volatile float, ptr addrspace(5) %stack2
|
||||
%add2 = fadd float %add, %val2
|
||||
ret float %add2
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
|
||||
bb0:
|
||||
%cmp = fcmp ogt float %arg0, 0.0
|
||||
br i1 %cmp, label %bb1, label %bb2
|
||||
|
||||
bb1:
|
||||
%stack = alloca float, i32 4, align 4, addrspace(5)
|
||||
store volatile float 2.0, ptr addrspace(5) %stack
|
||||
%val = load volatile float, ptr addrspace(5) %stack
|
||||
%add = fadd float %arg0, %val
|
||||
br label %bb2
|
||||
|
||||
bb2:
|
||||
%res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
|
||||
ret float %res
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
|
||||
bb0:
|
||||
br label %bb1
|
||||
|
||||
bb1:
|
||||
%ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
|
||||
%stack = alloca float, i32 4, align 4, addrspace(5)
|
||||
store volatile float 2.0, ptr addrspace(5) %stack
|
||||
%val = load volatile float, ptr addrspace(5) %stack
|
||||
%add = fadd float %arg0, %val
|
||||
%cmp = icmp sgt i32 %ctr, 0
|
||||
%newctr = sub i32 %ctr, 1
|
||||
br i1 %cmp, label %bb1, label %bb2
|
||||
|
||||
bb2:
|
||||
ret float %add
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
|
||||
%res = call amdgpu_gfx float @simple_stack(float %arg0)
|
||||
ret float %res
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
|
||||
%stack = alloca float, i32 4, align 4, addrspace(5)
|
||||
store volatile float 2.0, ptr addrspace(5) %stack
|
||||
%val = load volatile float, ptr addrspace(5) %stack
|
||||
%res = call amdgpu_gfx float @simple_stack(float %arg0)
|
||||
%add = fadd float %res, %val
|
||||
ret float %add
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
|
||||
%res = call amdgpu_gfx float @extern_func(float %arg0)
|
||||
ret float %res
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
|
||||
%stack = alloca float, i32 4, align 4, addrspace(5)
|
||||
store volatile float 2.0, ptr addrspace(5) %stack
|
||||
%val = load volatile float, ptr addrspace(5) %stack
|
||||
%res = call amdgpu_gfx float @extern_func(float %arg0)
|
||||
%add = fadd float %res, %val
|
||||
ret float %add
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
|
||||
%res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
|
||||
ret float %res
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
|
||||
%fptr = load ptr, ptr addrspace(4) @funcptr
|
||||
call amdgpu_gfx void %fptr()
|
||||
ret float %arg0
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
|
||||
%stack = alloca float, i32 4, align 4, addrspace(5)
|
||||
store volatile float 2.0, ptr addrspace(5) %stack
|
||||
%val = load volatile float, ptr addrspace(5) %stack
|
||||
%fptr = load ptr, ptr addrspace(4) @funcptr
|
||||
call amdgpu_gfx void %fptr()
|
||||
%add = fadd float %arg0, %val
|
||||
ret float %add
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
|
||||
%stack = alloca float, i32 4, align 4, addrspace(5)
|
||||
store volatile float 2.0, ptr addrspace(5) %stack
|
||||
%val = load volatile float, ptr addrspace(5) %stack
|
||||
%res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
|
||||
%add = fadd float %res, %val
|
||||
ret float %add
|
||||
}
|
||||
|
||||
@lds = internal addrspace(3) global [64 x float] poison
|
||||
|
||||
define amdgpu_gfx float @simple_lds(float %arg0) #0 {
|
||||
%val = load float, ptr addrspace(3) @lds
|
||||
ret float %val
|
||||
}
|
||||
|
||||
define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
|
||||
%val = load float, ptr addrspace(3) @lds
|
||||
%res = call amdgpu_gfx float @simple_lds_recurse(float %val)
|
||||
ret float %res
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-dynamic-vgpr-block-size"="16" }
|
||||
|
||||
!amdgpu.pal.metadata.msgpack = !{!0}
|
||||
|
||||
!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
|
||||
!1 = !{i32 7}
|
llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll (new normal file, 205 lines)
@ -0,0 +1,205 @@
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: {{^}}_amdgpu_cs_main:
|
||||
; CHECK: ; TotalNumSgprs: 34
|
||||
; CHECK: ; NumVgprs: 2
|
||||
; CHECK: .amdgpu_pal_metadata
|
||||
; CHECK-NEXT: ---
|
||||
; CHECK-NEXT: amdpal.pipelines:
|
||||
; CHECK-NEXT: - .api: Vulkan
|
||||
; CHECK-NEXT: .compute_registers:
|
||||
; CHECK-NEXT: .dynamic_vgpr_en: true
|
||||
; CHECK-NEXT: .tg_size_en: true
|
||||
; CHECK-NEXT: .tgid_x_en: false
|
||||
; CHECK-NEXT: .tgid_y_en: false
|
||||
; CHECK-NEXT: .tgid_z_en: false
|
||||
; CHECK-NEXT: .tidig_comp_cnt: 0x1
|
||||
; CHECK-NEXT: .graphics_registers:
|
||||
; CHECK-NEXT: .ps_extra_lds_size: 0
|
||||
; CHECK-NEXT: .spi_ps_input_addr:
|
||||
; CHECK-NEXT: .ancillary_ena: false
|
||||
; CHECK-NEXT: .front_face_ena: true
|
||||
; CHECK-NEXT: .line_stipple_tex_ena: false
|
||||
; CHECK-NEXT: .linear_center_ena: true
|
||||
; CHECK-NEXT: .linear_centroid_ena: true
|
||||
; CHECK-NEXT: .linear_sample_ena: true
|
||||
; CHECK-NEXT: .persp_center_ena: true
|
||||
; CHECK-NEXT: .persp_centroid_ena: true
|
||||
; CHECK-NEXT: .persp_pull_model_ena: false
|
||||
; CHECK-NEXT: .persp_sample_ena: true
|
||||
; CHECK-NEXT: .pos_fixed_pt_ena: true
|
||||
; CHECK-NEXT: .pos_w_float_ena: false
|
||||
; CHECK-NEXT: .pos_x_float_ena: false
|
||||
; CHECK-NEXT: .pos_y_float_ena: false
|
||||
; CHECK-NEXT: .pos_z_float_ena: false
|
||||
; CHECK-NEXT: .sample_coverage_ena: false
|
||||
; CHECK-NEXT: .spi_ps_input_ena:
|
||||
; CHECK-NEXT: .ancillary_ena: false
|
||||
; CHECK-NEXT: .front_face_ena: false
|
||||
; CHECK-NEXT: .line_stipple_tex_ena: false
|
||||
; CHECK-NEXT: .linear_center_ena: false
|
||||
; CHECK-NEXT: .linear_centroid_ena: false
|
||||
; CHECK-NEXT: .linear_sample_ena: false
|
||||
; CHECK-NEXT: .persp_center_ena: false
|
||||
; CHECK-NEXT: .persp_centroid_ena: false
|
||||
; CHECK-NEXT: .persp_pull_model_ena: false
|
||||
; CHECK-NEXT: .persp_sample_ena: true
|
||||
; CHECK-NEXT: .pos_fixed_pt_ena: false
|
||||
; CHECK-NEXT: .pos_w_float_ena: false
|
||||
; CHECK-NEXT: .pos_x_float_ena: false
|
||||
; CHECK-NEXT: .pos_y_float_ena: false
|
||||
; CHECK-NEXT: .pos_z_float_ena: false
|
||||
; CHECK-NEXT: .sample_coverage_ena: false
|
||||
; CHECK-NEXT: .hardware_stages:
|
||||
; CHECK-NEXT: .cs:
|
||||
; CHECK-NEXT: .checksum_value: 0x9444d7d0
|
||||
; CHECK-NEXT: .debug_mode: false
|
||||
; CHECK-NEXT: .dynamic_vgpr_saved_count: 0x70
|
||||
; CHECK-NEXT: .entry_point: _amdgpu_cs
|
||||
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
|
||||
; CHECK-NEXT: .excp_en: 0
|
||||
; CHECK-NEXT: .float_mode: 0xc0
|
||||
; CHECK-NEXT: .image_op: false
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .mem_ordered: true
|
||||
; CHECK-NEXT: .scratch_en: false
|
||||
; CHECK-NEXT: .scratch_memory_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x22
|
||||
; CHECK-NEXT: .sgpr_limit: 0x6a
|
||||
; CHECK-NEXT: .threadgroup_dimensions:
|
||||
; CHECK-NEXT: - 0x1
|
||||
; CHECK-NEXT: - 0x400
|
||||
; CHECK-NEXT: - 0x1
|
||||
; CHECK-NEXT: .trap_present: false
|
||||
; CHECK-NEXT: .user_data_reg_map:
|
||||
; CHECK-NEXT: - 0x10000000
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: - 0xffffffff
|
||||
; CHECK-NEXT: .user_sgprs: 0x3
|
||||
; CHECK-NEXT: .vgpr_count: 0x2
|
||||
; CHECK-NEXT: .vgpr_limit: 0x100
|
||||
; CHECK-NEXT: .wavefront_size: 0x40
|
||||
; CHECK-NEXT: .wgp_mode: false
|
||||
; CHECK-NEXT: .gs:
|
||||
; CHECK-NEXT: .debug_mode: false
|
||||
; CHECK-NEXT: .entry_point: _amdgpu_gs
|
||||
; CHECK-NEXT: .entry_point_symbol: gs_shader
|
||||
; CHECK-NEXT: .lds_size: 0x200
|
||||
; CHECK-NEXT: .mem_ordered: true
|
||||
; CHECK-NEXT: .scratch_en: false
|
||||
; CHECK-NEXT: .scratch_memory_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x1
|
||||
; CHECK-NEXT: .vgpr_count: 0x1
|
||||
; CHECK-NEXT: .wgp_mode: true
|
||||
; CHECK-NEXT: .hs:
|
||||
; CHECK-NEXT: .debug_mode: false
|
||||
; CHECK-NEXT: .entry_point: _amdgpu_hs
|
||||
; CHECK-NEXT: .entry_point_symbol: hs_shader
|
||||
; CHECK-NEXT: .lds_size: 0x1000
|
||||
; CHECK-NEXT: .mem_ordered: true
|
||||
; CHECK-NEXT: .scratch_en: false
|
||||
; CHECK-NEXT: .scratch_memory_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x1
|
||||
; CHECK-NEXT: .vgpr_count: 0x1
|
||||
; CHECK-NEXT: .wgp_mode: true
|
||||
; CHECK-NEXT: .ps:
|
||||
; CHECK-NEXT: .debug_mode: false
|
||||
; CHECK-NEXT: .entry_point: _amdgpu_ps
|
||||
; CHECK-NEXT: .entry_point_symbol: ps_shader
|
||||
; CHECK-NEXT: .lds_size: 0
|
||||
; CHECK-NEXT: .mem_ordered: true
|
||||
; CHECK-NEXT: .scratch_en: false
|
||||
; CHECK-NEXT: .scratch_memory_size: 0
|
||||
; CHECK-NEXT: .sgpr_count: 0x1
|
||||
; CHECK-NEXT: .vgpr_count: 0x1
|
||||
; CHECK-NEXT: .wgp_mode: true
|
||||
; CHECK: .registers: {}
|
||||
; CHECK:amdpal.version:
|
||||
; CHECK-NEXT: - 0x3
|
||||
; CHECK-NEXT: - 0
|
||||
; CHECK-NEXT:...
|
||||
; CHECK-NEXT: .end_amdgpu_pal_metadata
|
||||
|
||||
; Compute-shader entry point (attribute group #0, !lgc.shaderstage !1).
; The body:
;   1. forms a 64-bit addrspace(4) pointer from the upper 32 bits of the
;      current PC (s.getpc masked with -4294967296) OR'ed with the
;      zero-extended inreg argument %arg1;
;   2. loads a <4 x i32> from that pointer (presumably the buffer resource
;      descriptor - it is passed as the second operand of the store below);
;   3. adds bits [9:0] and bits [19:10] of %arg2 and shifts the sum left by 2;
;   4. calls raw.buffer.store.i32 with the constant 1 as data and the shifted
;      sum (%.idx) as its third operand.
define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
|
||||
.entry:
|
||||
%i = call i64 @llvm.amdgcn.s.getpc()
|
||||
%i1 = and i64 %i, -4294967296
|
||||
%i2 = zext i32 %arg1 to i64
|
||||
%i3 = or i64 %i1, %i2
|
||||
%i4 = inttoptr i64 %i3 to ptr addrspace(4)
|
||||
%i5 = and i32 %arg2, 1023
|
||||
%i6 = lshr i32 %arg2, 10
|
||||
%i7 = and i32 %i6, 1023
|
||||
%i8 = add nuw nsw i32 %i7, %i5
|
||||
%i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
|
||||
%.idx = shl nuw nsw i32 %i8, 2
|
||||
call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Pixel-shader entry point with an empty body (returns immediately).
; It uses attribute group #1, which sets "InitialPSInputAddr"; it does not
; carry the dynamic VGPR block-size attribute.
define dllexport amdgpu_ps void @ps_shader() #1 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; External addrspace(3) array of one i32, written by gs_shader below.
@LDS.GS = external addrspace(3) global [1 x i32], align 4
|
||||
|
||||
; Geometry-shader entry point: stores 0 to element 0 of @LDS.GS and returns.
; NOTE(review): it references attribute group #2, for which this file
; contains no "attributes #2 = ..." definition - confirm that is intended.
define dllexport amdgpu_gs void @gs_shader() #2 {
|
||||
%ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0
|
||||
store i32 0, ptr addrspace(3) %ptr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; External addrspace(3) array of 1024 i32 values, written by hs_shader below.
@LDS.HS = external addrspace(3) global [1024 x i32], align 4
|
||||
|
||||
; Hull-shader entry point: stores 0 to element 0 of @LDS.HS and returns.
; NOTE(review): it references attribute group #2, for which this file
; contains no "attributes #2 = ..." definition - confirm that is intended.
define dllexport amdgpu_hs void @hs_shader() #2 {
|
||||
%ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0
|
||||
store i32 0, ptr addrspace(3) %ptr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
!amdgpu.pal.metadata.msgpack = !{!0}
|
||||
|
||||
; External declarations.
; NOTE(review): the "Function Attrs" remarks below are plain comments; the
; attribute groups actually referenced (#1, #2, #3) are not all defined in
; this file (only #0 and #1 are), so the remarks may not match what the
; parser applies - verify.
; NOTE(review): @lgc.buffer.desc.to.ptr is not called by any function
; defined in this file.
; Function Attrs: nounwind willreturn memory(none)
|
||||
declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
|
||||
declare i64 @llvm.amdgcn.s.getpc() #2
|
||||
|
||||
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
|
||||
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3
|
||||
|
||||
; Attribute group #0 (used by _amdgpu_cs_main): fixed flat work-group size of
; 1024, +wavefrontsize64 and +cumode target features, and - the point of this
; test - a dynamic VGPR block size of 16, which enables dynamic VGPR mode.
attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode" "amdgpu-dynamic-vgpr-block-size"="16" }
|
||||
|
||||
; Attribute group #1 (used by ps_shader): sets "InitialPSInputAddr" and has
; no dynamic VGPR attribute.
attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }
|
||||
|
||||
!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
|
||||
!1 = !{i32 7}
|
340
llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12-dvgpr.mir
Normal file
340
llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12-dvgpr.mir
Normal file
@ -0,0 +1,340 @@
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
|
||||
|
||||
# Embedded LLVM IR module: one empty amdgpu_ps stub per MIR function below.
# The stubs exist to attach function attributes. Every function uses
# attribute group #0 (dynamic VGPR block size 16) except
# global_store_optnone, which uses #1 (same block size plus noinline optnone).
--- |
|
||||
define amdgpu_ps void @tbuffer_store1() #0 { ret void }
|
||||
define amdgpu_ps void @tbuffer_store2() #0 { ret void }
|
||||
define amdgpu_ps void @flat_store() #0 { ret void }
|
||||
define amdgpu_ps void @global_store() #0 { ret void }
|
||||
define amdgpu_ps void @buffer_store_format() #0 { ret void }
|
||||
define amdgpu_ps void @ds_write_b32() #0 { ret void }
|
||||
define amdgpu_ps void @global_store_dword() #0 { ret void }
|
||||
define amdgpu_ps void @multiple_basic_blocks1() #0 { ret void }
|
||||
define amdgpu_ps void @multiple_basic_blocks2() #0 { ret void }
|
||||
define amdgpu_ps void @multiple_basic_blocks3() #0 { ret void }
|
||||
define amdgpu_ps void @recursive_loop() #0 { ret void }
|
||||
define amdgpu_ps void @recursive_loop_vmem() #0 { ret void }
|
||||
define amdgpu_ps void @image_store() #0 { ret void }
|
||||
define amdgpu_ps void @scratch_store() #0 { ret void }
|
||||
define amdgpu_ps void @buffer_atomic() #0 { ret void }
|
||||
define amdgpu_ps void @flat_atomic() #0 { ret void }
|
||||
define amdgpu_ps void @global_atomic() #0 { ret void }
|
||||
define amdgpu_ps void @image_atomic() #0 { ret void }
|
||||
define amdgpu_ps void @global_store_optnone() #1 { ret void }
|
||||
|
||||
attributes #0 = { "amdgpu-dynamic-vgpr-block-size" = "16" }
|
||||
attributes #1 = { "amdgpu-dynamic-vgpr-block-size" = "16" noinline optnone }
|
||||
...
|
||||
|
||||
# Single-block shader: a typed buffer store (OFFSET form) followed by
# S_ENDPGM. With the dynamic VGPR attribute set, si-insert-waitcnts is
# expected to emit "S_ALLOC_VGPR 0" ahead of S_ENDPGM and no "S_SENDMSG 3"
# (presumably the DEALLOC_VGPRS message, which is not allowed in this mode).
---
|
||||
name: tbuffer_store1
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: tbuffer_store1
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Same scenario as a single typed buffer store before S_ENDPGM, here using
# the OFFEN form with an explicit memory operand. Expected output:
# "S_ALLOC_VGPR 0" before S_ENDPGM and no "S_SENDMSG 3".
---
|
||||
name: tbuffer_store2
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: tbuffer_store2
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Single-block shader ending right after a FLAT store. Expected output:
# "S_ALLOC_VGPR 0" before S_ENDPGM and no "S_SENDMSG 3".
---
|
||||
name: flat_store
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: flat_store
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Global store followed by an explicit S_WAIT_STORECNT 0 already present in
# the input, then S_ENDPGM. Expected output: "S_ALLOC_VGPR 0" before
# S_ENDPGM and no "S_SENDMSG 3".
---
|
||||
name: global_store
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: global_store
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
|
||||
S_WAIT_STORECNT 0
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Single-block shader ending right after a formatted (D16) buffer store.
# Expected output: "S_ALLOC_VGPR 0" before S_ENDPGM and no "S_SENDMSG 3".
---
|
||||
name: buffer_store_format
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: buffer_store_format
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# The only memory operation here is a DS (LDS) write of two IMPLICIT_DEF
# VGPRs. "S_ALLOC_VGPR 0" is still expected before S_ENDPGM, and
# "S_SENDMSG 3" must not appear.
---
|
||||
name: ds_write_b32
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: ds_write_b32
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $vgpr0 = IMPLICIT_DEF
|
||||
renamable $vgpr1 = IMPLICIT_DEF
|
||||
DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec, implicit $m0
|
||||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
# VALU multiply-add feeding a global store (SADDR form), then S_ENDPGM.
# Expected output: "S_ALLOC_VGPR 0" before S_ENDPGM and no "S_SENDMSG 3".
# NOTE(review): the liveins list names only $vgpr0 and $sgpr0_sgpr1, while
# the body also reads $vgpr1, $vgpr2 and $sgpr2 - presumably harmless for
# this pass, but confirm.
---
|
||||
name: global_store_dword
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $sgpr0_sgpr1
|
||||
|
||||
; CHECK-LABEL: name: global_store_dword
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
|
||||
GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Three-block CFG with no stores: bb.0 does a buffer load, bb.1 is a
# self-looping block containing a VALU add, and bb.2 holds S_ENDPGM.
# Expected output: "S_ALLOC_VGPR 0" before S_ENDPGM and no "S_SENDMSG 3".
---
|
||||
name: multiple_basic_blocks1
|
||||
body: |
|
||||
; CHECK-LABEL: name: multiple_basic_blocks1
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
|
||||
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
successors: %bb.1, %bb.2
|
||||
|
||||
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
|
||||
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
|
||||
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
|
||||
S_BRANCH %bb.2
|
||||
|
||||
bb.2:
|
||||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
|
||||
# bb.0 and bb.1 each contain a typed buffer store and branch to bb.2, which
# holds only S_ENDPGM. The checks are anchored on "bb.2:": within that block
# "S_ALLOC_VGPR 0" must precede S_ENDPGM and "S_SENDMSG 3" must not appear.
---
|
||||
name: multiple_basic_blocks2
|
||||
body: |
|
||||
; CHECK-LABEL: name: multiple_basic_blocks2
|
||||
; CHECK: bb.2:
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.2
|
||||
|
||||
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
|
||||
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
|
||||
S_BRANCH %bb.2
|
||||
|
||||
bb.1:
|
||||
successors: %bb.2
|
||||
|
||||
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
|
||||
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
|
||||
S_BRANCH %bb.2
|
||||
|
||||
bb.2:
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Five-block CFG: only bb.0 contains a store; bb.0 and bb.1 feed bb.2, which
# together with bb.3 feeds the exit block bb.4. The checks are anchored on
# "bb.4:": "S_ALLOC_VGPR 0" must precede S_ENDPGM there and "S_SENDMSG 3"
# must not appear.
---
|
||||
name: multiple_basic_blocks3
|
||||
body: |
|
||||
; CHECK-LABEL: name: multiple_basic_blocks3
|
||||
; CHECK: bb.4:
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.2
|
||||
|
||||
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
|
||||
TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
|
||||
S_BRANCH %bb.2
|
||||
|
||||
bb.1:
|
||||
successors: %bb.2
|
||||
|
||||
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
|
||||
S_BRANCH %bb.2
|
||||
|
||||
bb.2:
|
||||
successors: %bb.4
|
||||
|
||||
S_BRANCH %bb.4
|
||||
|
||||
bb.3:
|
||||
successors: %bb.4
|
||||
|
||||
$vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
|
||||
S_BRANCH %bb.4
|
||||
|
||||
bb.4:
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Buffer load in bb.0, then a self-looping bb.1 that contains only a scalar
# compare and branches, then S_ENDPGM in bb.2. Expected output:
# "S_ALLOC_VGPR 0" before S_ENDPGM and no "S_SENDMSG 3".
---
|
||||
name: recursive_loop
|
||||
body: |
|
||||
; CHECK-LABEL: name: recursive_loop
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
|
||||
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
successors: %bb.1, %bb.2
|
||||
|
||||
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
|
||||
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
|
||||
S_BRANCH %bb.2
|
||||
|
||||
bb.2:
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Variant of the self-loop test in which the looping block bb.1 itself
# contains a typed buffer store. Expected output: "S_ALLOC_VGPR 0" before
# S_ENDPGM and no "S_SENDMSG 3".
---
|
||||
name: recursive_loop_vmem
|
||||
body: |
|
||||
; CHECK-LABEL: name: recursive_loop_vmem
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
|
||||
renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
|
||||
S_BRANCH %bb.1
|
||||
|
||||
bb.1:
|
||||
successors: %bb.1, %bb.2
|
||||
|
||||
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
|
||||
S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
|
||||
S_CBRANCH_SCC1 %bb.1, implicit killed $scc
|
||||
S_BRANCH %bb.2
|
||||
|
||||
bb.2:
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Single-block shader ending right after an image store. Expected output:
# "S_ALLOC_VGPR 0" before S_ENDPGM and no "S_SENDMSG 3".
# NOTE(review): the opcode carries a _gfx11 suffix although the RUN line
# targets gfx1200 - presumably inherited from the test this was derived from.
---
|
||||
name: image_store
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: image_store
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Scalar AND producing the address register, then a scratch store (SADDR
# form), then S_ENDPGM. Expected output: "S_ALLOC_VGPR 0" before S_ENDPGM
# and no "S_SENDMSG 3".
---
|
||||
name: scratch_store
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: scratch_store
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
|
||||
SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Single-block shader ending right after a buffer atomic add that does not
# define a result register. Expected output: "S_ALLOC_VGPR 0" before
# S_ENDPGM and no "S_SENDMSG 3".
---
|
||||
name: buffer_atomic
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: buffer_atomic
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Single-block shader ending right after a FLAT atomic decrement that
# returns its result (_RTN form). Expected output: "S_ALLOC_VGPR 0" before
# S_ENDPGM and no "S_SENDMSG 3".
---
|
||||
name: flat_atomic
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: flat_atomic
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
|
||||
# Single-block shader ending right after a global atomic increment that
# returns its result (SADDR _RTN form). Expected output: "S_ALLOC_VGPR 0"
# before S_ENDPGM and no "S_SENDMSG 3".
---
|
||||
name: global_atomic
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: global_atomic
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Single-block shader ending right after an image atomic compare-swap that
# defines a result register. Expected output: "S_ALLOC_VGPR 0" before
# S_ENDPGM and no "S_SENDMSG 3".
---
|
||||
name: image_atomic
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: image_atomic
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK: S_ALLOC_VGPR 0
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Negative case: the function's IR stub is marked "noinline optnone" in
# addition to the dynamic VGPR block size. The body is a global store,
# S_WAIT_STORECNT 0 and S_ENDPGM; here neither "S_SENDMSG 3" nor any
# S_ALLOC_VGPR may be emitted before S_ENDPGM.
---
|
||||
name: global_store_optnone
|
||||
body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: global_store_optnone
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
|
||||
S_WAIT_STORECNT 0
|
||||
S_ENDPGM 0
|
||||
...
|
@ -1,5 +1,4 @@
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
|
||||
|
||||
--- |
|
||||
define amdgpu_ps void @tbuffer_store1() { ret void }
|
||||
@ -29,8 +28,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: tbuffer_store1
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
@ -42,8 +40,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: tbuffer_store2
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
|
||||
S_ENDPGM 0
|
||||
@ -55,8 +52,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: flat_store
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
@ -68,8 +64,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: global_store
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
|
||||
S_WAIT_STORECNT 0
|
||||
@ -82,8 +77,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: buffer_store_format
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
@ -95,8 +89,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: ds_write_b32
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $vgpr0 = IMPLICIT_DEF
|
||||
renamable $vgpr1 = IMPLICIT_DEF
|
||||
@ -112,8 +105,7 @@ body: |
|
||||
|
||||
; CHECK-LABEL: name: global_store_dword
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
|
||||
GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
|
||||
@ -125,8 +117,7 @@ name: multiple_basic_blocks1
|
||||
body: |
|
||||
; CHECK-LABEL: name: multiple_basic_blocks1
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
@ -153,8 +144,7 @@ body: |
|
||||
; CHECK-LABEL: name: multiple_basic_blocks2
|
||||
; CHECK: bb.2:
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.2
|
||||
@ -180,8 +170,7 @@ body: |
|
||||
; CHECK-LABEL: name: multiple_basic_blocks3
|
||||
; CHECK: bb.4:
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.2
|
||||
@ -216,8 +205,7 @@ name: recursive_loop
|
||||
body: |
|
||||
; CHECK-LABEL: name: recursive_loop
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
@ -241,8 +229,7 @@ name: recursive_loop_vmem
|
||||
body: |
|
||||
; CHECK-LABEL: name: recursive_loop_vmem
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
bb.0:
|
||||
successors: %bb.1
|
||||
@ -268,8 +255,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: image_store
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
|
||||
S_ENDPGM 0
|
||||
@ -281,8 +267,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: scratch_store
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
|
||||
SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
|
||||
@ -295,8 +280,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: buffer_atomic
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
|
||||
S_ENDPGM 0
|
||||
@ -308,8 +292,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: flat_atomic
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
|
||||
S_ENDPGM 0
|
||||
@ -322,8 +305,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: global_atomic
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
|
||||
S_ENDPGM 0
|
||||
@ -335,8 +317,7 @@ body: |
|
||||
bb.0:
|
||||
; CHECK-LABEL: name: image_atomic
|
||||
; CHECK-NOT: S_SENDMSG 3
|
||||
; DEFAULT-NOT: S_ALLOC_VGPR
|
||||
; DVGPR: S_ALLOC_VGPR 0
|
||||
; CHECK-NOT: S_ALLOC_VGPR
|
||||
; CHECK: S_ENDPGM 0
|
||||
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
|
||||
S_ENDPGM 0
|
||||
|
@ -44,6 +44,7 @@
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
||||
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
|
||||
@ -312,6 +313,7 @@
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
||||
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
|
||||
|
@ -44,6 +44,7 @@
|
||||
; AFTER-PEI-NEXT: sgprForEXECCopy: ''
|
||||
; AFTER-PEI-NEXT: longBranchReservedReg: ''
|
||||
; AFTER-PEI-NEXT: hasInitWholeWave: false
|
||||
; AFTER-PEI-NEXT: dynamicVGPRBlockSize: 0
|
||||
; AFTER-PEI-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
; AFTER-PEI-NEXT: body:
|
||||
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
|
@ -44,6 +44,7 @@
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
||||
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
|
||||
|
@ -44,6 +44,7 @@
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
||||
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {
|
||||
|
@ -53,6 +53,7 @@
|
||||
# FULL-NEXT: sgprForEXECCopy: ''
|
||||
# FULL-NEXT: longBranchReservedReg: ''
|
||||
# FULL-NEXT: hasInitWholeWave: false
|
||||
# FULL-NEXT: dynamicVGPRBlockSize: 0
|
||||
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
# FULL-NEXT: body:
|
||||
|
||||
@ -159,6 +160,7 @@ body: |
|
||||
# FULL-NEXT: sgprForEXECCopy: ''
|
||||
# FULL-NEXT: longBranchReservedReg: ''
|
||||
# FULL-NEXT: hasInitWholeWave: false
|
||||
# FULL-NEXT: dynamicVGPRBlockSize: 0
|
||||
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
# FULL-NEXT: body:
|
||||
|
||||
@ -236,6 +238,7 @@ body: |
|
||||
# FULL-NEXT: sgprForEXECCopy: ''
|
||||
# FULL-NEXT: longBranchReservedReg: ''
|
||||
# FULL-NEXT: hasInitWholeWave: false
|
||||
# FULL-NEXT: dynamicVGPRBlockSize: 0
|
||||
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
# FULL-NEXT: body:
|
||||
|
||||
@ -314,6 +317,7 @@ body: |
|
||||
# FULL-NEXT: sgprForEXECCopy: ''
|
||||
# FULL-NEXT: longBranchReservedReg: ''
|
||||
# FULL-NEXT: hasInitWholeWave: false
|
||||
# FULL-NEXT: dynamicVGPRBlockSize: 0
|
||||
# FULL-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
# FULL-NEXT: body:
|
||||
|
||||
|
@ -54,6 +54,7 @@
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
||||
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
|
||||
@ -102,6 +103,7 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
||||
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
; CHECK-NEXT: body:
|
||||
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
|
||||
@ -174,6 +176,7 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
||||
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
; CHECK-NEXT: body:
|
||||
define void @function() {
|
||||
@ -228,6 +231,7 @@ define void @function() {
|
||||
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
||||
; CHECK-NEXT: longBranchReservedReg: ''
|
||||
; CHECK-NEXT: hasInitWholeWave: false
|
||||
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
||||
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
||||
; CHECK-NEXT: body:
|
||||
define void @function_nsz() #0 {
|
||||
|
@ -93,16 +93,16 @@ static const std::pair<StringRef, StringRef>
|
||||
W32FS = {"+wavefrontsize32", "w32"},
|
||||
W64FS = {"+wavefrontsize64", "w64"};
|
||||
|
||||
using TestFuncTy =
|
||||
function_ref<bool(std::stringstream &, unsigned, const GCNSubtarget &)>;
|
||||
// Callback run once per occupancy level by testAndRecord. Arguments, in
// order: the table stream to append results to, the occupancy being tested,
// the subtarget, and the dynamic VGPR block size (0 is what the regular
// register-limit tests pass). The block size must be `unsigned`: declaring it
// `bool` would collapse the 16 and 32 passed by the dynamic VGPR tests to 1
// before they reach the callback.
using TestFuncTy = function_ref<bool(std::stringstream &, unsigned,
const GCNSubtarget &, unsigned)>;
|
||||
|
||||
static bool testAndRecord(std::stringstream &Table, const GCNSubtarget &ST,
|
||||
TestFuncTy test) {
|
||||
TestFuncTy test, unsigned DynamicVGPRBlockSize) {
|
||||
bool Success = true;
|
||||
unsigned MaxOcc = ST.getMaxWavesPerEU();
|
||||
for (unsigned Occ = MaxOcc; Occ > 0; --Occ) {
|
||||
Table << std::right << std::setw(3) << Occ << " ";
|
||||
Success = test(Table, Occ, ST) && Success;
|
||||
Success = test(Table, Occ, ST, DynamicVGPRBlockSize) && Success;
|
||||
Table << '\n';
|
||||
}
|
||||
return Success;
|
||||
@ -132,7 +132,7 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
|
||||
FS = &W32FS;
|
||||
|
||||
std::stringstream Table;
|
||||
bool Success = testAndRecord(Table, ST, test);
|
||||
bool Success = testAndRecord(Table, ST, test, /*DynamicVGPRBlockSize=*/0);
|
||||
if (!Success || PrintCpuRegLimits)
|
||||
TablePerCPUs[Table.str()].push_back((CanonCPUName + FS->second).str());
|
||||
|
||||
@ -155,40 +155,50 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
|
||||
|
||||
static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
|
||||
TestFuncTy test) {
|
||||
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
|
||||
"+dynamic-vgpr," + FS.str());
|
||||
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
|
||||
ASSERT_TRUE(TM) << "No target machine";
|
||||
|
||||
GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
|
||||
std::string(TM->getTargetFeatureString()), *TM);
|
||||
ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
|
||||
|
||||
std::stringstream Table;
|
||||
bool Success = testAndRecord(Table, ST, test);
|
||||
EXPECT_TRUE(Success && !PrintCpuRegLimits)
|
||||
<< CPUName << " dynamic VGPR " << FS
|
||||
<< ":\nOcc MinVGPR MaxVGPR\n"
|
||||
<< Table.str() << '\n';
|
||||
auto testWithBlockSize = [&](unsigned DynamicVGPRBlockSize) {
|
||||
std::stringstream Table;
|
||||
bool Success = testAndRecord(Table, ST, test, DynamicVGPRBlockSize);
|
||||
EXPECT_TRUE(Success && !PrintCpuRegLimits)
|
||||
<< CPUName << " dynamic VGPR block size " << DynamicVGPRBlockSize
|
||||
<< ":\nOcc MinVGPR MaxVGPR\n"
|
||||
<< Table.str() << '\n';
|
||||
};
|
||||
|
||||
testWithBlockSize(16);
|
||||
testWithBlockSize(32);
|
||||
}
|
||||
|
||||
TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
|
||||
auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
|
||||
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
|
||||
auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST,
|
||||
unsigned DynamicVGPRBlockSize) {
|
||||
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs(DynamicVGPRBlockSize);
|
||||
return checkMinMax(
|
||||
OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum), ST.getMaxWavesPerEU(),
|
||||
[&](unsigned NumGPRs) { return ST.getOccupancyWithNumVGPRs(NumGPRs); },
|
||||
[&](unsigned Occ) { return ST.getMinNumVGPRs(Occ); },
|
||||
[&](unsigned Occ) { return ST.getMaxNumVGPRs(Occ); });
|
||||
OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum, DynamicVGPRBlockSize),
|
||||
ST.getMaxWavesPerEU(),
|
||||
[&](unsigned NumGPRs) {
|
||||
return ST.getOccupancyWithNumVGPRs(NumGPRs, DynamicVGPRBlockSize);
|
||||
},
|
||||
[&](unsigned Occ) {
|
||||
return ST.getMinNumVGPRs(Occ, DynamicVGPRBlockSize);
|
||||
},
|
||||
[&](unsigned Occ) {
|
||||
return ST.getMaxNumVGPRs(Occ, DynamicVGPRBlockSize);
|
||||
});
|
||||
};
|
||||
|
||||
testGPRLimits("VGPR", true, test);
|
||||
|
||||
testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
|
||||
testDynamicVGPRLimits("gfx1200",
|
||||
"+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
|
||||
}
|
||||
|
||||
static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
|
||||
unsigned DynamicVGPRBlockSize,
|
||||
unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
|
||||
unsigned ExpectedMaxVGPRs) {
|
||||
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
|
||||
@ -206,11 +216,15 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
|
||||
Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
|
||||
Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
|
||||
|
||||
std::string DVGPRBlockSize = std::to_string(DynamicVGPRBlockSize);
|
||||
if (DynamicVGPRBlockSize)
|
||||
Func->addFnAttr("amdgpu-dynamic-vgpr-block-size", DVGPRBlockSize);
|
||||
|
||||
auto Range = ST.getWavesPerEU(*Func);
|
||||
EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
|
||||
EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
|
||||
EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
|
||||
EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
|
||||
EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs(DynamicVGPRBlockSize))
|
||||
<< CPUName << ' ' << FS;
|
||||
|
||||
// Function with requested 'amdgpu-waves-per-eu' in a valid range.
|
||||
@ -221,11 +235,10 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
|
||||
}
|
||||
|
||||
TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
|
||||
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
|
||||
testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
|
||||
testAbsoluteLimits(
|
||||
"gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
|
||||
1, 16, 256);
|
||||
// CPUName, Features, DynamicVGPRBlockSize; Expected MinOcc, MaxOcc, MaxVGPRs
|
||||
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 0, 1, 16, 256);
|
||||
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 16, 1, 16, 128);
|
||||
testAbsoluteLimits("gfx1200", "+wavefrontsize32", 32, 1, 16, 256);
|
||||
}
|
||||
|
||||
static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user