llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
michaelselehov cb3fbe921b
[AMDGPU] Set preferred function alignment based on icache geometry (#183064)
Non-entry functions were unconditionally aligned to 4 bytes with no
architecture-specific preferred alignment, and setAlignment() was used
instead of ensureAlignment(), overwriting any explicit IR attributes.

Add instruction cache line size and fetch alignment data to GCNSubtarget
for each generation (GFX9: 64B/32B, GFX10: 64B/4B, GFX11+: 128B/4B). Use
this to call setPrefFunctionAlignment() in SITargetLowering, aligning
non-entry functions to the cache line size by default. Change
setAlignment to ensureAlignment in AMDGPUAsmPrinter so explicit IR align
attributes are respected.

Empirical thread trace analysis on gfx942, gfx1030, gfx1100, and gfx1200
showed that only GFX9 exhibits measurable fetch stalls when functions
cross the 32-byte fetch window boundary. GFX10+ showed no alignment
sensitivity. A hidden option -amdgpu-align-functions-for-fetch-only is
provided to use the fetch granularity instead of cache line size.

Assisted-by: Claude Opus
2026-03-11 07:57:37 -04:00

1799 lines
72 KiB
C++

//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
/// code. When passed an MCAsmStreamer it prints assembly and when passed
/// an MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//
//
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPUMCResourceInfo.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
#include "Utils/SIDefinesUtils.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
// This should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
// and sin_f32, cos_f32 on most parts).
// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
/// Builds the floating-point mode word for \p Mode.
///
/// Rounding is always round-to-nearest for both single and double precision;
/// the denormal fields are taken from \p Mode.
static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
  const uint32_t RoundBits = FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
                             FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST);
  const uint32_t DenormBits = FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
                              FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
  return RoundBits | DenormBits;
}
/// Factory handed to the target registry: allocates an AMDGPUAsmPrinter that
/// takes ownership of \p Streamer.
static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  auto *Printer = new AMDGPUAsmPrinter(tm, std::move(Streamer));
  return Printer;
}
/// Registers the assembly printer factories for the R600 and GCN targets.
extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
LLVMInitializeAMDGPUAsmPrinter() {
  // R600 uses its own printer factory.
  Target &R600Target = getTheR600Target();
  TargetRegistry::RegisterAsmPrinter(R600Target,
                                     llvm::createR600AsmPrinterPass);

  // GCN uses the AMDGPUAsmPrinter defined in this file.
  Target &GCNTarget = getTheGCNTarget();
  TargetRegistry::RegisterAsmPrinter(GCNTarget, createAMDGPUAsmPrinterPass);
}
/// Forwards \p TM and \p Streamer to the generic AsmPrinter and checks that a
/// streamer was actually supplied.
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
    : AsmPrinter(TM, std::move(Streamer)) {
  assert(OutStreamer != nullptr && "AsmPrinter constructed without streamer");
}
/// Human-readable name of this pass.
StringRef AMDGPUAsmPrinter::getPassName() const {
  return StringRef("AMDGPU Assembly Printer");
}
/// Returns the subtarget info owned by the target machine (module-wide, not
/// per-function).
const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
  const MCSubtargetInfo *GlobalSTI = TM.getMCSubtargetInfo();
  return GlobalSTI;
}
/// Returns the output streamer's target streamer viewed as an
/// AMDGPUTargetStreamer, or nullptr when there is no output streamer.
AMDGPUTargetStreamer *AMDGPUAsmPrinter::getTargetStreamer() const {
  return OutStreamer ? static_cast<AMDGPUTargetStreamer *>(
                           OutStreamer->getTargetStreamer())
                     : nullptr;
}
/// Start-of-file hook. Nothing is emitted here; it only clears the flag so
/// that initTargetStreamer() runs later on demand.
void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
  // Target streamer setup is deferred until it is first needed.
  IsTargetStreamerInitialized = false;
}
/// Performs the deferred target streamer setup for module \p M: initializes
/// the target ID if needed and, for AMDHSA / AMDPAL, emits the target
/// directive plus the OS-specific start-of-module data.
void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
  IsTargetStreamerInitialized = true;

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  AMDGPUTargetStreamer *TS = getTargetStreamer();
  if (TS && !TS->getTargetID())
    initializeTargetID(M);

  const Triple::OSType OS = TM.getTargetTriple().getOS();
  const bool IsHSA = OS == Triple::AMDHSA;
  const bool IsPAL = OS == Triple::AMDPAL;

  // Other operating systems need nothing further.
  if (!IsHSA && !IsPAL)
    return;

  TS->EmitDirectiveAMDGCNTarget();

  if (IsHSA) {
    TS->EmitDirectiveAMDHSACodeObjectVersion(CodeObjectVersion);
    HSAMetadataStream->begin(M, *TS->getTargetID());
  }

  if (IsPAL)
    TS->getPALMetadata()->readFromIR(M);
}
/// End-of-file hook: emits the ISA version on non-AMDHSA targets, or
/// finalizes and emits the HSA metadata on AMDHSA.
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
  // The target streamer may still be uninitialized at this point (setup is
  // lazy), so initialize it now if that has not happened yet.
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(M);

  const bool IsHSA = TM.getTargetTriple().getOS() == Triple::AMDHSA;
  if (!IsHSA) {
    getTargetStreamer()->EmitISAVersion();
    return;
  }

  // Close the HSA metadata stream and hand it to the target streamer.
  HSAMetadataStream->end();
  [[maybe_unused]] const bool Success =
      HSAMetadataStream->emitTo(*getTargetStreamer());
  assert(Success && "Malformed HSA Metadata");
}
void AMDGPUAsmPrinter::emitFunctionBodyStart() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
const Function &F = MF->getFunction();
// TODO: We're checking this late, would be nice to check it earlier.
if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
reportFatalUsageError(
STM.getCPU() + " is only available on code object version 6 or better");
}
// TODO: Which one is called first, emitStartOfAsmFile or
// emitFunctionBodyStart?
if (!getTargetStreamer()->getTargetID())
initializeTargetID(*F.getParent());
const auto &FunctionTargetID = STM.getTargetID();
// Make sure function's xnack settings are compatible with module's
// xnack settings.
if (FunctionTargetID.isXnackSupported() &&
FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
"' function does not match module xnack setting");
return;
}
// Make sure function's sramecc settings are compatible with module's
// sramecc settings.
if (FunctionTargetID.isSramEccSupported() &&
FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
"' function does not match module sramecc setting");
return;
}
if (!MFI.isEntryFunction())
return;
if (STM.isMesaKernel(F) &&
(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
AMDGPUMCKernelCodeT KernelCode;
getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
KernelCode.validate(&STM, MF->getContext());
getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
}
if (STM.isAmdHsaOS())
HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}
void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
if (!MFI.isEntryFunction())
return;
if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
return;
auto &Streamer = getTargetStreamer()->getStreamer();
auto &Context = Streamer.getContext();
auto &ObjectFileInfo = *Context.getObjectFileInfo();
auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
Streamer.pushSection();
Streamer.switchSection(&ReadOnlySection);
// CP microcode requires the kernel descriptor to be allocated on 64 byte
// alignment.
Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
ReadOnlySection.ensureMinAlignment(Align(64));
const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
SmallString<128> KernelName;
getNameWithPrefix(KernelName, &MF->getFunction());
getTargetStreamer()->EmitAmdhsaKernelDescriptor(
STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
CurrentProgramInfo.NumVGPRsForWavesPerEU,
MCBinaryExpr::createSub(
CurrentProgramInfo.NumSGPRsForWavesPerEU,
AMDGPUMCExpr::createExtraSGPRs(
CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
Context),
CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
Streamer.popSection();
}
void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
Register RegNo = MI->getOperand(0).getReg();
SmallString<128> Str;
raw_svector_ostream OS(Str);
OS << "implicit-def: "
<< printReg(RegNo, MF->getSubtarget().getRegisterInfo());
if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
OS << " : SGPR spill to VGPR lane";
OutStreamer->AddComment(OS.str());
OutStreamer->addBlankLine();
}
/// Emits the entry label of the current function.
///
/// On AMDHSA the generic AsmPrinter implementation is used unchanged. On
/// other targets, entry functions for which isAmdHsaOrMesa() holds first get
/// their symbol type set to STT_AMDGPU_HSA_KERNEL, and when code dumping is
/// active the label is also recorded in the disassembly listing.
void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    AsmPrinter::emitFunctionEntryLabel();
    return;
  }

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
    SmallString<128> SymbolName;
    // These are two separate statements; the original joined them with a
    // comma operator by mistake.
    getNameWithPrefix(SymbolName, &MF->getFunction());
    getTargetStreamer()->EmitAMDGPUSymbolType(
        SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
  }

  if (DumpCodeInstEmitter) {
    // Disassemble function name label to text.
    DisasmLines.push_back(MF->getName().str() + ":");
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    // Labels have no encoded bytes, so the matching hex line stays empty.
    HexLines.emplace_back("");
  }

  AsmPrinter::emitFunctionEntryLabel();
}
/// Emits the start of \p MBB. When code dumping is active, a "BB<fn>_<n>:"
/// label line is also added to the disassembly listing unless the block is
/// only reachable by fallthrough.
void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
  const bool RecordLabel =
      DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB);
  if (RecordLabel) {
    // Write a line for the basic block label if it is not only fallthrough.
    std::string Label = (Twine("BB") + Twine(getFunctionNumber()) + "_" +
                         Twine(MBB.getNumber()) + ":")
                            .str();
    DisasmLines.push_back(std::move(Label));
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    // Keep HexLines index-aligned with DisasmLines.
    HexLines.emplace_back("");
  }
  AsmPrinter::emitBasicBlockStart(MBB);
}
void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
OutContext.reportError({},
Twine(GV->getName()) +
": unsupported initializer for address space");
return;
}
// LDS variables aren't emitted in HSA or PAL yet.
const Triple::OSType OS = TM.getTargetTriple().getOS();
if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
return;
MCSymbol *GVSym = getSymbol(GV);
GVSym->redefineIfPossible();
if (GVSym->isDefined() || GVSym->isVariable())
report_fatal_error("symbol '" + Twine(GVSym->getName()) +
"' is already defined");
const DataLayout &DL = GV->getDataLayout();
uint64_t Size = GV->getGlobalSize(DL);
Align Alignment = GV->getAlign().value_or(Align(4));
emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
emitLinkage(GV, GVSym);
auto *TS = getTargetStreamer();
TS->emitAMDGPULDS(GVSym, Size, Alignment);
return;
}
AsmPrinter::emitGlobalVariable(GV);
}
/// Module-level setup: records the code object version of \p M and, on
/// AMDHSA, creates the metadata streamer matching that version. An
/// unsupported version is reported as a fatal usage error.
bool AMDGPUAsmPrinter::doInitialization(Module &M) {
  CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);

  const bool IsHSA = TM.getTargetTriple().getOS() == Triple::AMDHSA;
  if (IsHSA) {
    if (CodeObjectVersion == AMDGPU::AMDHSA_COV4)
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
    else if (CodeObjectVersion == AMDGPU::AMDHSA_COV5)
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
    else if (CodeObjectVersion == AMDGPU::AMDHSA_COV6)
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
    else
      reportFatalUsageError("unsupported code object version");
  }

  return AsmPrinter::doInitialization(M);
}
/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
///
/// Extracts from \p STM only the values the occupancy computation needs and
/// packs them, together with \p InitOcc, \p NumSGPRs and \p NumVGPRs, into an
/// AGVK_Occupancy expression. Should match the computeOccupancy
/// implementation without passing \p STM on.
const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
                                    const MCExpr *NumVGPRs,
                                    unsigned DynamicVGPRBlockSize,
                                    const GCNSubtarget &STM, MCContext &Ctx) {
  const auto MakeConst = [&Ctx](unsigned Value) {
    return MCConstantExpr::create(Value, Ctx);
  };

  // Operand order is part of the AGVK_Occupancy contract; keep it as is.
  const MCExpr *MaxWaves = MakeConst(IsaInfo::getMaxWavesPerEU(&STM));
  const MCExpr *Granule =
      MakeConst(IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize));
  const MCExpr *TargetTotalNumVGPRs =
      MakeConst(IsaInfo::getTotalNumVGPRs(&STM));
  const MCExpr *Generation = MakeConst(STM.getGeneration());
  const MCExpr *InitialOccupancy = MakeConst(InitOcc);

  return AMDGPUMCExpr::create(AMDGPUMCExpr::AGVK_Occupancy,
                              {MaxWaves, Granule, TargetTotalNumVGPRs,
                               Generation, InitialOccupancy, NumSGPRs,
                               NumVGPRs},
                              Ctx);
}
/// Checks the resource-usage symbols recorded for \p F against subtarget
/// limits and emits diagnostics through the function's LLVMContext.
///
/// Only defined functions with a module-entry calling convention are
/// examined. A check is skipped when the symbol it needs is not a variable or
/// its expression cannot be evaluated to an absolute value. Checks, in order:
/// scratch size, addressable SGPRs (before implicit SGPRs), total SGPRs
/// (including implicit ones), and the occupancy requested through
/// 'amdgpu-waves-per-eu'.
void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
return;
using RIK = MCResourceInfo::ResourceInfoKind;
const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
MCSymbol *FnSym = TM.getSymbol(&F);
bool IsLocal = F.hasLocalLinkage();
// Evaluates Value to an absolute constant; returns false (leaving Res
// untouched) when the expression is not resolvable.
auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
int64_t Val;
if (Value->evaluateAsAbsolute(Val)) {
Res = Val;
return true;
}
return false;
};
// Scratch limit per work item: the per-wave maximum divided by the wave size.
const uint64_t MaxScratchPerWorkitem =
STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
MCSymbol *ScratchSizeSymbol = RI.getSymbol(
FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal);
uint64_t ScratchSize;
// Note: an oversized scratch is diagnosed but validation continues.
if (ScratchSizeSymbol->isVariable() &&
TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
ScratchSize > MaxScratchPerWorkitem) {
DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
DS_Error);
F.getContext().diagnose(DiagStackSize);
}
// Validate addressable scalar registers (i.e., prior to added implicit
// SGPRs).
MCSymbol *NumSGPRSymbol =
RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal);
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
!STM.hasSGPRInitBug()) {
unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
uint64_t NumSgpr;
if (NumSGPRSymbol->isVariable() &&
TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
NumSgpr > MaxAddressableNumSGPRs) {
DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
NumSgpr, MaxAddressableNumSGPRs,
DS_Error, DK_ResourceLimit);
F.getContext().diagnose(Diag);
return;
}
}
MCSymbol *VCCUsedSymbol =
RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal);
MCSymbol *FlatUsedSymbol = RI.getSymbol(
FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
uint64_t VCCUsed, FlatUsed, NumSgpr;
// The remaining checks need all three of NumSGPR, UsesVCC and UsesFlatScratch
// to be resolvable.
if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
FlatUsedSymbol->isVariable() &&
TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
// Recomputes NumSgprs + implicit SGPRs but all symbols should now be
// resolvable.
NumSgpr += IsaInfo::getNumExtraSGPRs(
&STM, VCCUsed, FlatUsed,
getTargetStreamer()->getTargetID()->isXnackOnOrAny());
// On SEA_ISLANDS and older, or with the SGPR init bug, the limit applies to
// the total including implicit SGPRs.
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
STM.hasSGPRInitBug()) {
unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
if (NumSgpr > MaxAddressableNumSGPRs) {
DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
MaxAddressableNumSGPRs, DS_Error,
DK_ResourceLimit);
F.getContext().diagnose(Diag);
return;
}
}
MCSymbol *NumVgprSymbol =
RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal);
MCSymbol *NumAgprSymbol =
RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal);
uint64_t NumVgpr, NumAgpr;
MachineModuleInfo &MMI =
getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
// Note: this local MF is the MachineFunction for F and hides the MF member
// inside this scope.
MachineFunction *MF = MMI.getMachineFunction(F);
if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
unsigned MaxWaves = MFI.getMaxWavesPerEU();
uint64_t TotalNumVgpr =
getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
// Register counts used for occupancy are clamped from below by 1 and by the
// subtarget minimum for MaxWaves.
uint64_t NumVGPRsForWavesPerEU =
std::max({TotalNumVgpr, (uint64_t)1,
(uint64_t)STM.getMinNumVGPRs(
MaxWaves, MFI.getDynamicVGPRBlockSize())});
uint64_t NumSGPRsForWavesPerEU = std::max(
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
const MCExpr *OccupancyExpr = createOccupancy(
STM.getOccupancyWithWorkGroupSizes(*MF).second,
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
MFI.getDynamicVGPRBlockSize(), STM, OutContext);
uint64_t Occupancy;
// Only the minimum of the requested waves-per-EU range is checked; {0, 0} is
// the default when the attribute is absent.
const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
F, "amdgpu-waves-per-eu", {0, 0}, true);
if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
DiagnosticInfoOptimizationFailure Diag(
F, F.getSubprogram(),
"failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
"'" +
F.getName() + "': desired occupancy was " + Twine(MinWEU) +
", final occupancy is " + Twine(Occupancy));
F.getContext().diagnose(Diag);
return;
}
}
}
}
/// Module-level teardown: pads the text section with s_code_end where
/// applicable, finalizes the resource info, emits the GPR maximums section,
/// validates every function's resource usage and resets the resource info
/// before delegating to the generic AsmPrinter.
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  // Pad with s_code_end to help tools and guard against instruction prefetch
  // causing stale data in caches. Arguably this should be done by the linker,
  // which is why this isn't done for Mesa.
  const MCSubtargetInfo &STI = *getGlobalSTI();
  const bool PadsCode = AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI);
  const Triple::OSType OS = STI.getTargetTriple().getOS();
  const bool IsHSAOrPAL = OS == Triple::AMDHSA || OS == Triple::AMDPAL;
  if (PadsCode && IsHSAOrPAL) {
    MCSection *TextSect = getObjFileLowering().getTextSection();
    // Don't do it if there is no code.
    if (TextSect->hasInstructions()) {
      OutStreamer->switchSection(TextSect);
      getTargetStreamer()->EmitCodeEnd(STI);
    }
  }

  // Assign expressions which can only be resolved when all other functions are
  // known.
  RI.finalize(OutContext);

  // Switch section and emit all GPR maximums within the processed module.
  MCSectionELF *MaxGPRSection =
      OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
  OutStreamer->pushSection();
  OutStreamer->switchSection(MaxGPRSection);
  getTargetStreamer()->EmitMCResourceMaximums(
      RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext),
      RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext));
  OutStreamer->popSection();

  // Validation must happen before the resource info is reset.
  for (Function &Fn : M.functions())
    validateMCResourceInfo(Fn);

  RI.reset();

  return AsmPrinter::doFinalization(M);
}
/// Folds \p Value with foldAMDGPUMCExpr and returns its printed form.
SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
  MCContext &Ctx = getTargetStreamer()->getStreamer().getContext();
  const MCExpr *Folded = foldAMDGPUMCExpr(Value, Ctx);

  SmallString<128> Text;
  raw_svector_ostream Out(Text);
  printAMDGPUMCExpr(Folded, Out, MAI);
  return Text;
}
// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
const AMDGPUMachineFunction *MFI) {
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
false);
OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
if (NumAGPR && TotalNumVGPR) {
OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
false);
}
OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
false);
OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
false);
}
/// Builds the kernel_code_properties expression for \p MF.
///
/// The statically known bits (user SGPR enables and wave32) are collected
/// into a constant; the dynamic-stack bit is OR-ed in as a shifted MCExpr
/// because it may not be evaluatable yet.
const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
    const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
  MCContext &Ctx = MF.getContext();

  uint16_t StaticBits = 0;
  const auto EnableIf = [&StaticBits](bool Enabled, unsigned Mask) {
    if (Enabled)
      StaticBits |= Mask;
  };

  EnableIf(UserSGPRInfo.hasPrivateSegmentBuffer(),
           amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
  EnableIf(UserSGPRInfo.hasDispatchPtr(),
           amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
  EnableIf(UserSGPRInfo.hasQueuePtr(),
           amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
  EnableIf(UserSGPRInfo.hasKernargSegmentPtr(),
           amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
  EnableIf(UserSGPRInfo.hasDispatchID(),
           amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
  EnableIf(UserSGPRInfo.hasFlatScratchInit(),
           amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
  EnableIf(UserSGPRInfo.hasPrivateSegmentSize(),
           amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
  EnableIf(MF.getSubtarget<GCNSubtarget>().isWave32(),
           amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);

  // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
  // un-evaluatable at this point so it cannot be conditionally checked here.
  // Instead, shift the possibly unknown MCExpr into its place and bitwise-or
  // it into the static bits.
  const MCExpr *StaticExpr = MCConstantExpr::create(StaticBits, Ctx);
  const MCExpr *ShiftAmount = MCConstantExpr::create(
      amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
  const MCExpr *DynamicStackBit = MCBinaryExpr::createShl(
      CurrentProgramInfo.DynamicCallStack, ShiftAmount, Ctx);
  return MCBinaryExpr::createOr(StaticExpr, DynamicStackBit, Ctx);
}
/// Builds the AMDHSA kernel descriptor for \p MF from the program info \p PI.
///
/// \param MF Machine function of the kernel being described.
/// \param PI Program info the descriptor fields are taken from.
/// \returns The populated MCKernelDescriptor.
///
/// Every PI-derived field, including compute_pgm_rsrc3, is read from \p PI.
/// kernel_code_properties is still produced by
/// getAmdhsaKernelCodeProperties(), which takes only \p MF.
MCKernelDescriptor
AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
                                            const SIProgramInfo &PI) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();

  MCKernelDescriptor KernelDescriptor;

  KernelDescriptor.group_segment_fixed_size =
      MCConstantExpr::create(PI.LDSSize, Ctx);
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;

  // Only the segment size is used; the alignment out-parameter is discarded.
  Align MaxKernArgAlign;
  KernelDescriptor.kernarg_size = MCConstantExpr::create(
      STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);

  KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
  KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  // Sanity check: unless the subtarget is GFX10+, GFX90A or GFX1250, an
  // evaluatable compute_pgm_rsrc3 must be zero.
  int64_t PGM_Rsrc3 = 1;
  bool EvaluatableRsrc3 = PI.ComputePGMRSrc3->evaluateAsAbsolute(PGM_Rsrc3);
  (void)PGM_Rsrc3;
  (void)EvaluatableRsrc3;
  assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
         STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
         static_cast<uint64_t>(PGM_Rsrc3) == 0);
  KernelDescriptor.compute_pgm_rsrc3 = PI.ComputePGMRSrc3;

  KernelDescriptor.kernarg_preload = MCConstantExpr::create(
      AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
      Ctx);

  return KernelDescriptor;
}
/// Emits one machine function: sets up alignment and per-function state,
/// gathers resource info, writes OS-specific program metadata, emits the body
/// and resource symbols and, depending on verbosity and code-dump settings,
/// trailing comment and disassembly sections.
///
/// Always returns false.
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Init target streamer lazily on the first function so that previous passes
// can set metadata.
if (!IsTargetStreamerInitialized)
initTargetStreamer(*MF.getFunction().getParent());
ResourceUsage =
&getAnalysis<AMDGPUResourceUsageAnalysisWrapperPass>().getResourceInfo();
CurrentProgramInfo.reset(MF);
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
MCContext &Ctx = MF.getContext();
// The starting address of all shader programs must be 256 bytes aligned.
// Regular functions just need the basic required instruction alignment.
MF.ensureAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
SetupMachineFunction(MF);
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
MCContext &Context = getObjFileLowering().getContext();
bool IsLocal = MF.getFunction().hasLocalLinkage();
// FIXME: This should be an explicit check for Mesa.
if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
MCSectionELF *ConfigSection =
Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
OutStreamer->switchSection(ConfigSection);
}
RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);
// Program info is only computed for module entry functions.
if (MFI->isModuleEntryFunction()) {
getSIProgramInfo(CurrentProgramInfo, MF);
}
// OS-specific program metadata: PAL metadata on AMDPAL, the SI program info
// block when the OS is neither AMDPAL nor AMDHSA, nothing here on AMDHSA.
if (STM.isAmdPalOS()) {
if (MFI->isEntryFunction())
EmitPALMetadata(MF, CurrentProgramInfo);
else if (MFI->isModuleEntryFunction())
emitPALFunctionMetadata(MF);
} else if (!STM.isAmdHsaOS()) {
EmitProgramInfoSI(MF, CurrentProgramInfo);
}
DumpCodeInstEmitter = nullptr;
if (STM.dumpCode()) {
// For -dumpcode, get the assembler out of the streamer. This only works
// with -filetype=obj.
MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
if (Assembler)
DumpCodeInstEmitter = Assembler->getEmitterPtr();
}
// Start this function with an empty disassembly listing.
DisasmLines.clear();
HexLines.clear();
DisasmLineMaxLen = 0;
emitFunctionBody();
emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
STM.hasMAIInsts());
// Emit the per-function resource info symbols.
{
using RIK = MCResourceInfo::ResourceInfoKind;
getTargetStreamer()->EmitMCResourceInfo(
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext,
IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
OutContext, IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
OutContext, IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
OutContext, IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
OutContext, IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext,
IsLocal),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
OutContext, IsLocal));
}
// Emit _dvgpr$ symbol when appropriate.
emitDVgprSymbol(MF);
// In verbose mode, write human-readable resource comments into the
// .AMDGPU.csdata section.
if (isVerbose()) {
MCSectionELF *CommentSection =
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
OutStreamer->switchSection(CommentSection);
// Non-entry functions get only the common comments, built from the resource
// info symbols. NOTE(review): this path returns before the -dumpcode
// disassembly section below is emitted - confirm that is intended.
if (!MFI->isEntryFunction()) {
using RIK = MCResourceInfo::ResourceInfoKind;
OutStreamer->emitRawComment(" Function info:", false);
emitCommonFunctionComments(
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
IsLocal)
->getVariableValue(),
STM.hasMAIInsts()
? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
OutContext, IsLocal)
->getVariableValue()
: nullptr,
RI.createTotalNumVGPRs(MF, Ctx),
RI.createTotalNumSGPRs(
MF,
MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
Ctx),
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
OutContext, IsLocal)
->getVariableValue(),
CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
return false;
}
// Entry functions: common comments plus the values from CurrentProgramInfo.
OutStreamer->emitRawComment(" Kernel info:", false);
emitCommonFunctionComments(
CurrentProgramInfo.NumArchVGPR,
STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
CurrentProgramInfo.ScratchSize,
CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
OutStreamer->emitRawComment(
" IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
OutStreamer->emitRawComment(
" LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
" bytes/workgroup (compile time only)", false);
OutStreamer->emitRawComment(
" SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
OutStreamer->emitRawComment(
" VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
OutStreamer->emitRawComment(
" NumSGPRsForWavesPerEU: " +
getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
false);
OutStreamer->emitRawComment(
" NumVGPRsForWavesPerEU: " +
getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
false);
// The printed AccumOffset is (AccumOffset + 1) * 4.
if (STM.hasGFX90AInsts()) {
const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
AdjustedAccum = MCBinaryExpr::createMul(
AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
OutStreamer->emitRawComment(
" AccumOffset: " + getMCExprStr(AdjustedAccum), false);
}
if (STM.hasGFX1250Insts())
OutStreamer->emitRawComment(
" NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
false);
OutStreamer->emitRawComment(
" Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
getMCExprStr(CurrentProgramInfo.ScratchEnable),
false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(CurrentProgramInfo.UserSGPR),
false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
Twine(CurrentProgramInfo.TrapHandlerEnable),
false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
Twine(CurrentProgramInfo.TGIdXEnable),
false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
Twine(CurrentProgramInfo.TGIdYEnable),
false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
Twine(CurrentProgramInfo.TGIdZEnable),
false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
Twine(CurrentProgramInfo.TIdIGCompCount),
false);
// Unless the subtarget is GFX10+, GFX90A or GFX1250, compute_pgm_rsrc3 must
// evaluate to zero.
[[maybe_unused]] int64_t PGMRSrc3;
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
(CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
static_cast<uint64_t>(PGMRSrc3) == 0));
if (STM.hasGFX90AInsts()) {
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
getMCExprStr(MCKernelDescriptor::bits_get(
CurrentProgramInfo.ComputePGMRSrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
getMCExprStr(MCKernelDescriptor::bits_get(
CurrentProgramInfo.ComputePGMRSrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
false);
}
}
// With -dumpcode active, write the collected disassembly (and hex, where
// present) into the .AMDGPU.disasm section.
if (DumpCodeInstEmitter) {
OutStreamer->switchSection(
Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
for (size_t i = 0; i < DisasmLines.size(); ++i) {
std::string Comment = "\n";
// Pad each line to the longest disassembly line so the hex column aligns.
if (!HexLines[i].empty()) {
Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
Comment += " ; " + HexLines[i] + "\n";
}
OutStreamer->emitBytes(StringRef(DisasmLines[i]));
OutStreamer->emitBytes(StringRef(Comment));
}
}
return false;
}
// Emits, when applicable, a `_dvgpr$<function name>` symbol next to the
// function symbol.
//
// The new symbol's value is the function symbol's value plus an offset that
// encodes one less than the number of VGPR blocks used by the function in
// bits 5..3. A "VGPR block" can be either 16 VGPRs (for a max of 128), or 32
// VGPRs (for a max of 256). This is used by a front-end to have functions that
// are chained rather than called, and a dispatcher that dynamically resizes
// the VGPR count before dispatching to a function.
//
// Nothing is emitted unless dynamic VGPRs are enabled for \p MF and the
// function uses the amdgpu_cs_chain calling convention. An error is reported
// (and no symbol emitted) when more than 8 blocks would be needed.
void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
  const SIMachineFunctionInfo &FuncInfo = *MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  // Only chain functions compiled with dynamic VGPRs get the extra symbol.
  if (!FuncInfo.isDynamicVGPREnabled() ||
      Fn.getCallingConv() != CallingConv::AMDGPU_CS_Chain)
    return;

  MCContext &Ctx = MF.getContext();
  unsigned BlockSize = FuncInfo.getDynamicVGPRBlockSize();

  // The VGPR count must have resolved to a plain constant by now.
  MCValue VGPRCount;
  const bool Resolved =
      CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
          VGPRCount, nullptr) &&
      VGPRCount.isAbsolute();
  if (!Resolved)
    llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");

  // Calculate number of VGPR blocks.
  // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
  const unsigned UsedVGPRs =
      std::max(static_cast<unsigned>(VGPRCount.getConstant()), 1U);
  const unsigned BlockCount = divideCeil(UsedVGPRs, BlockSize);
  if (BlockCount > 8) {
    OutContext.reportError({},
                           "too many DVGPR blocks for _dvgpr$ symbol for '" +
                               Twine(CurrentFnSym->getName()) + "'");
    return;
  }

  // (BlockCount - 1) is placed starting at bit 3 of the symbol value.
  const unsigned EncodedBlockCount = (BlockCount - 1) << 3;

  // Add to function symbol to create _dvgpr$ symbol.
  const MCExpr *FnAddress = MCSymbolRefExpr::create(CurrentFnSym, Ctx);
  const MCExpr *BlockBits = MCConstantExpr::create(EncodedBlockCount, Ctx);
  const MCExpr *AliasValue = MCBinaryExpr::createAdd(FnAddress, BlockBits, Ctx);
  MCSymbol *AliasSym =
      Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());

  // The alias carries the same visibility and linkage as the function.
  OutStreamer->emitAssignment(AliasSym, AliasValue);
  emitVisibility(AliasSym, Fn.getVisibility());
  emitLinkage(&Fn, AliasSym);
}
// TODO: Fold this into emitFunctionBodyStart.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
// In the beginning all features are either 'Any' or 'NotSupported',
// depending on global target features. This will cover empty modules.
getTargetStreamer()->initializeTargetID(*getGlobalSTI(),
getGlobalSTI()->getFeatureString());
// If module is empty, we are done.
if (M.empty())
return;
// If module is not empty, need to find first 'Off' or 'On' feature
// setting per feature from functions in module.
for (auto &F : M) {
auto &TSTargetID = getTargetStreamer()->getTargetID();
if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
(!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
break;
const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
if (TSTargetID->isXnackSupported())
if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
if (TSTargetID->isSramEccSupported())
if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
}
}
// Builds the MCExpr for the accumulation offset derived from \p NumVGPR. It is
// the symbolic counterpart of:
//   alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
  const MCExpr *Four = MCConstantExpr::create(4, Ctx);
  const MCExpr *One = MCConstantExpr::create(1, Ctx);

  // Clamp to at least 1 so the final subtraction cannot go below zero.
  const MCExpr *AtLeastOne = AMDGPUMCExpr::createMax({One, NumVGPR}, Ctx);

  // Align up and divide: together this is divideCeil(AtLeastOne, 4).
  const MCExpr *RoundedUp = AMDGPUMCExpr::createAlignTo(AtLeastOne, Four, Ctx);
  const MCExpr *GroupsOfFour = MCBinaryExpr::createDiv(RoundedUp, Four, Ctx);

  return MCBinaryExpr::createSub(GroupsOfFour, One, Ctx);
}
// Fills \p ProgInfo with the program settings computed for \p MF.
//
// Register counts, scratch size and the values derived from them are built as
// MCExprs that reference the per-function resource-info symbols obtained from
// RI.getSymbol(), so they do not have to be known constants at this point.
// Limit checks are only performed when the relevant expression already
// evaluates to an absolute value; violations are reported through the
// function's LLVMContext.
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  bool IsLocal = MF.getFunction().hasLocalLinkage();
  MCContext &Ctx = MF.getContext();
  // Wraps an integer constant in an MCConstantExpr.
  auto CreateExpr = [&Ctx](int64_t Value) {
    return MCConstantExpr::create(Value, Ctx);
  };
  // Returns true and stores the value in Res if Value evaluates to an absolute
  // constant; otherwise returns false and leaves Res untouched.
  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };
  // Returns a reference to the current function's resource-info symbol of the
  // requested kind.
  auto GetSymRefExpr =
      [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
    MCSymbol *Sym =
        RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
    return MCSymbolRefExpr::create(Sym, Ctx);
  };
  using RIK = MCResourceInfo::ResourceInfoKind;
  ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
  ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
  ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
      ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
  ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
  ProgInfo.TgSplit = STM.isTgSplitEnabled();
  ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
  ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
  ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
  ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
  ProgInfo.DynamicCallStack =
      MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
                             GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
  // NamedBarCnt = alignTo(NumNamedBarrier, 4) / 4, expressed as an MCExpr.
  const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
  const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
      GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
  ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
      ProgInfo.VCCUsed, ProgInfo.FlatUsed,
      getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(
          MF.getFunction(), "addressable scalar registers", NumSgpr,
          MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
    }
  }
  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
  const Function &F = MF.getFunction();
  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are passed as function args.
  unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
           WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
  if (WaveDispatchNumSGPR) {
    ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumSGPR,
         MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
                                 Ctx)},
        Ctx);
  }
  if (WaveDispatchNumVGPR) {
    // NOTE(review): the maximum is taken against ProgInfo.NumVGPR (the total
    // computed from NumAccVGPR and NumArchVGPR above) rather than against
    // ProgInfo.NumArchVGPR - confirm that this is intended.
    ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
    ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
        ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
  }
  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  unsigned MaxWaves = MFI->getMaxWavesPerEU();
  ProgInfo.NumSGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
                              Ctx);
  ProgInfo.NumVGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumVGPRs(
                                   MaxWaves, MFI->getDynamicVGPRBlockSize()))},
                              Ctx);
  // For these subtargets the limit is checked after ExtraSGPRs were added.
  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
                                       NumSgpr, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
      ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
    }
  }
  // With the SGPR init bug, both SGPR counts are replaced by a fixed value.
  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
    ProgInfo.NumSGPRsForWavesPerEU =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
  }
  // Diagnose too many user SGPRs.
  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(),
                                     STM.getMaxNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }
  // Diagnose LDS usage beyond the addressable local memory size.
  if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(
        MF.getFunction(), "local memory", MFI->getLDSSize(),
        STM.getAddressableLocalMemorySize(), DS_Error);
    Ctx.diagnose(Diag);
  }
  // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
  // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
  auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
                                             unsigned Granule) {
    const MCExpr *OneConst = CreateExpr(1ul);
    const MCExpr *GranuleConst = CreateExpr(Granule);
    const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
    const MCExpr *AlignToGPR =
        AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
    const MCExpr *DivGPR =
        MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
    const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
    return SubGPR;
  };
  // GFX10+ will always allocate 128 SGPRs and this field must be 0
  if (STM.getGeneration() >= AMDGPUSubtarget::GFX10) {
    ProgInfo.SGPRBlocks = CreateExpr(0ul);
  } else {
    ProgInfo.SGPRBlocks = GetNumGPRBlocks(
        ProgInfo.NumSGPRsForWavesPerEU, IsaInfo::getSGPREncodingGranule(&STM));
  }
  ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
                                        IsaInfo::getVGPREncodingGranule(&STM));
  const SIModeRegisterDefaults Mode = MFI->getMode();
  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(Mode);
  ProgInfo.IEEEMode = Mode.IEEE;
  // Make the clamp modifier return 0 on NaN input.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;
  // LDSBlocks below is LDSSize rounded up to, and counted in units of,
  // (1 << LDSAlignShift) bytes; the shift is selected from the value returned
  // by getLdsDwGranularity().
  unsigned LDSAlignShift = 8;
  switch (getLdsDwGranularity(STM)) {
  case 512:
  case 320:
    LDSAlignShift = 11;
    break;
  case 128:
    LDSAlignShift = 9;
    break;
  case 64:
    LDSAlignShift = 8;
    break;
  default:
    llvm_unreachable("invald LDS block size");
  }
  ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
  ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
  ProgInfo.LDSSize = MFI->getLDSSize();
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
  // The MCExpr equivalent of divideCeil.
  auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
    const MCExpr *Ceil =
        AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
    return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
  };
  // Scratch is allocated in 64-dword or 256-dword blocks.
  unsigned ScratchAlignShift =
      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks = DivideCeil(
      MCBinaryExpr::createMul(ProgInfo.ScratchSize,
                              CreateExpr(STM.getWavefrontSize()), Ctx),
      CreateExpr(1ULL << ScratchAlignShift));
  if (STM.supportsWGP()) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
  }
  // Note: this check uses the ISA version of the global subtarget's CPU, not
  // the per-function subtarget STM.
  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.MemOrdered = 1;
    ProgInfo.FwdProgress = !F.hasFnAttribute("amdgpu-no-fwd-progress");
  }
  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;
  // The private segment wave byte offset is the last of the system SGPRs. We
  // initially assumed it was allocated, and may have used it. It shouldn't harm
  // anything to disable it if we know the stack isn't used here. We may still
  // have emitted code reading it to initialize scratch, but if that's unused
  // reading garbage should be OK.
  ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
      MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,
                             MCConstantExpr::create(0, Ctx), Ctx),
      ProgInfo.DynamicCallStack, Ctx);
  ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
  // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
  ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
  ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
  ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
  ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
  ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
  ProgInfo.TIdIGCompCount = TIDIGCompCnt;
  ProgInfo.EXCPEnMSB = 0;
  // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
  ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
  ProgInfo.EXCPEnable = 0;
  // return ((Dst & ~Mask) | (Value << Shift))
  auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
                        uint32_t Shift) {
    const auto *Shft = MCConstantExpr::create(Shift, Ctx);
    const auto *Msk = MCConstantExpr::create(Mask, Ctx);
    Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
    Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
                                 Ctx);
    return Dst;
  };
  // Insert the ACCUM_OFFSET and TG_SPLIT fields into COMPUTE_PGM_RSRC3.
  if (STM.hasGFX90AInsts()) {
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
  }
  // Insert the NAMED_BAR_CNT field into COMPUTE_PGM_RSRC3.
  if (STM.hasGFX1250Insts())
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
  ProgInfo.Occupancy = createOccupancy(
      STM.computeOccupancy(F, ProgInfo.LDSSize).second,
      ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
      MFI->getDynamicVGPRBlockSize(), STM, Ctx);
  // Report when the occupancy, if already known, is below the minimum
  // requested through the "amdgpu-waves-per-eu" attribute.
  const auto [MinWEU, MaxWEU] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
  uint64_t Occupancy;
  if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(),
        "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
        "'" +
            F.getName() + "': desired occupancy was " + Twine(MinWEU) +
            ", final occupancy is " + Twine(Occupancy));
    F.getContext().diagnose(Diag);
  }
  // On GFX11+, set the INST_PREF_SIZE field of COMPUTE_PGM_RSRC3 from a lower
  // bound of the function's code size, counted in 128-byte units and clamped
  // to the largest value the field can hold.
  if (isGFX11Plus(STM)) {
    uint32_t CodeSizeInBytes = (uint32_t)std::min(
        ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
        (uint64_t)std::numeric_limits<uint32_t>::max());
    uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
    uint32_t Field, Shift, Width;
    if (isGFX11(STM)) {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
    } else {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
    }
    uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
    ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
                                       CreateExpr(InstPrefSize), Field, Shift);
  }
}
// Maps a calling convention to the register constant of its PGM_RSRC1
// register. Calling conventions that are not listed explicitly use the compute
// register, the same as AMDGPU_CS.
static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  case CallingConv::AMDGPU_LS:
    return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS:
    return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES:
    return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS:
    return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS:
    return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS:
    return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  case CallingConv::AMDGPU_CS:
  default:
    return R_00B848_COMPUTE_PGM_RSRC1;
  }
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
MCContext &Ctx = MF.getContext();
// (((Value) & Mask) << Shift)
auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),
shft, Ctx);
};
auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
int64_t Val;
if (Value->evaluateAsAbsolute(Val))
OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
else
OutStreamer->emitValue(Value, Size);
};
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
/*Size=*/4);
OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
// Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
// appropriate generation.
if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x3FFFF, /*Shift=*/12),
/*Size=*/4);
} else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x7FFF, /*Shift=*/12),
/*Size=*/4);
} else {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x1FFF, /*Shift=*/12),
/*Size=*/4);
}
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
} else {
OutStreamer->emitInt32(RsrcReg);
const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
MF.getContext());
EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
// Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
// appropriate generation.
if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x3FFFF, /*Shift=*/12),
/*Size=*/4);
} else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x7FFF, /*Shift=*/12),
/*Size=*/4);
} else {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x1FFF, /*Shift=*/12),
/*Size=*/4);
}
}
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
: CurrentProgramInfo.LDSBlocks;
OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
OutStreamer->emitInt32(MFI->getPSInputEnable());
OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
OutStreamer->emitInt32(MFI->getPSInputAddr());
}
OutStreamer->emitInt32(R_SPILLED_SGPRS);
OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
OutStreamer->emitInt32(R_SPILLED_VGPRS);
OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}
// Helper function to add common PAL Metadata 3.0+
//
// Records the hardware-stage entries shared by entry points and callable
// functions for calling convention \p CC in \p MD, taking the values from
// \p CurrentProgramInfo.
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
                                  const SIProgramInfo &CurrentProgramInfo,
                                  CallingConv::ID CC, const GCNSubtarget &ST,
                                  unsigned DynamicVGPRBlockSize) {
  const SIProgramInfo &Info = CurrentProgramInfo;

  // .ieee_mode is only written for subtargets with this feature.
  if (ST.hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
    MD->setHwStage(CC, ".ieee_mode", static_cast<bool>(Info.IEEEMode));

  MD->setHwStage(CC, ".wgp_mode", static_cast<bool>(Info.WgpMode));
  MD->setHwStage(CC, ".mem_ordered", static_cast<bool>(Info.MemOrdered));
  MD->setHwStage(CC, ".forward_progress", static_cast<bool>(Info.FwdProgress));

  if (AMDGPU::isCompute(CC)) {
    MD->setHwStage(CC, ".trap_present",
                   static_cast<bool>(Info.TrapHandlerEnable));
    MD->setHwStage(CC, ".excp_en", Info.EXCPEnable);

    // A non-zero block size means dynamic VGPRs are in use.
    if (DynamicVGPRBlockSize)
      MD->setComputeRegisters(".dynamic_vgpr_en", true);
  }

  // LdsSize is scaled by the LDS dword granularity and the dword size.
  const unsigned LdsSizeScaled = static_cast<unsigned>(
      Info.LdsSize * getLdsDwGranularity(ST) * sizeof(uint32_t));
  MD->updateHwStageMaximum(CC, ".lds_size", LdsSizeScaled);
}
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto CC = MF.getFunction().getCallingConv();
auto *MD = getTargetStreamer()->getPALMetadata();
auto &Ctx = MF.getContext();
MD->setEntryPoint(CC, MF.getFunction().getName());
MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
// For targets that support dynamic VGPRs, set the number of saved dynamic
// VGPRs (if any) in the PAL metadata.
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (MFI->isDynamicVGPREnabled() &&
MFI->getScratchReservedForDynamicVGPRs() > 0)
MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
MFI->getScratchReservedForDynamicVGPRs() / 4);
// Only set AGPRs for supported devices
if (STM.hasMAIInsts()) {
MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
}
MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
if (MD->getPALMajorVersion() < 3) {
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
if (AMDGPU::isCompute(CC)) {
MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
} else {
const MCExpr *HasScratchBlocks =
MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
MCConstantExpr::create(0, Ctx), Ctx);
auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
}
} else {
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
CurrentProgramInfo.ScratchEnable);
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
MFI->getDynamicVGPRBlockSize());
}
// ScratchSize is in bytes, 16 aligned.
MD->setScratchSize(
CC,
AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
MCConstantExpr::create(16, Ctx), Ctx),
Ctx);
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
: CurrentProgramInfo.LDSBlocks;
if (MD->getPALMajorVersion() < 3) {
MD->setRsrc2(
CC,
MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
Ctx);
MD->setSpiPsInputEna(MFI->getPSInputEnable());
MD->setSpiPsInputAddr(MFI->getPSInputAddr());
} else {
// Graphics registers
const unsigned ExtraLdsDwGranularity =
STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
MD->setGraphicsRegisters(
".ps_extra_lds_size",
(unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
// Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
static StringLiteral const PsInputFields[] = {
".persp_sample_ena", ".persp_center_ena",
".persp_centroid_ena", ".persp_pull_model_ena",
".linear_sample_ena", ".linear_center_ena",
".linear_centroid_ena", ".line_stipple_tex_ena",
".pos_x_float_ena", ".pos_y_float_ena",
".pos_z_float_ena", ".pos_w_float_ena",
".front_face_ena", ".ancillary_ena",
".sample_coverage_ena", ".pos_fixed_pt_ena"};
unsigned PSInputEna = MFI->getPSInputEnable();
unsigned PSInputAddr = MFI->getPSInputAddr();
for (auto [Idx, Field] : enumerate(PsInputFields)) {
MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
(bool)((PSInputEna >> Idx) & 1));
MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
(bool)((PSInputAddr >> Idx) & 1));
}
}
}
// For version 3 and above the wave front size is already set in the metadata
if (MD->getPALMajorVersion() < 3 && STM.isWave32())
MD->setWave32(MF.getFunction().getCallingConv());
}
void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
auto *MD = getTargetStreamer()->getPALMetadata();
const MachineFrameInfo &MFI = MF.getFrameInfo();
StringRef FnName = MF.getFunction().getName();
MD->setFunctionScratchSize(FnName, MFI.getStackSize());
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MCContext &Ctx = MF.getContext();
if (MD->getPALMajorVersion() < 3) {
// Set compute registers
MD->setRsrc1(
CallingConv::AMDGPU_CS,
CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
MD->setRsrc2(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
} else {
EmitPALMetadataCommon(
MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
}
// Set optional info
MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}
// Translates a private element size in bytes into the matching
// amd_element_byte_size_t enumerator. Only 4, 8 and 16 are accepted; any other
// value is treated as unreachable.
// This is supposed to be log2(Size)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  if (Size == 4)
    return AMD_ELEMENT_4_BYTES;
  if (Size == 8)
    return AMD_ELEMENT_8_BYTES;
  if (Size == 16)
    return AMD_ELEMENT_16_BYTES;
  llvm_unreachable("invalid private_element_size");
}
// Fills the kernel code header \p Out for the kernel \p MF from
// \p CurrentProgramInfo, the function's user SGPR usage and the subtarget.
// \p MF must use the AMDGPU_KERNEL or SPIR_KERNEL calling convention.
void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  // ORs Property into the code properties when Enabled is true.
  auto EnableIf = [&Out](bool Enabled, auto Property) {
    if (Enabled)
      Out.code_properties |= Property;
  };

  Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);

  Out.compute_pgm_resource1_registers =
      CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
  Out.compute_pgm_resource2_registers =
      CurrentProgramInfo.getComputePGMRSrc2(Ctx);
  Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;

  Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;

  AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));

  // One code property per user SGPR the function uses.
  const GCNUserSGPRUsageInfo &UserSGPRs = FuncInfo->getUserSGPRInfo();
  EnableIf(UserSGPRs.hasPrivateSegmentBuffer(),
           AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
  EnableIf(UserSGPRs.hasDispatchPtr(),
           AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
  EnableIf(UserSGPRs.hasQueuePtr(), AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
  EnableIf(UserSGPRs.hasKernargSegmentPtr(),
           AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
  EnableIf(UserSGPRs.hasDispatchID(),
           AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
  EnableIf(UserSGPRs.hasFlatScratchInit(),
           AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
  EnableIf(UserSGPRs.hasPrivateSegmentSize(),
           AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);

  EnableIf(STM.isXNACKEnabled(), AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED);

  // getKernArgSegmentSize() also reports the largest kernel argument alignment.
  Align MaxKernArgAlign;
  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

  // kernarg_segment_alignment is specified as log of the alignment.
  // The minimum alignment is 16.
  // FIXME: The metadata treats the minimum as 4?
  const Align KernArgAlign = std::max(Align(16), MaxKernArgAlign);
  Out.kernarg_segment_alignment = Log2(KernArgAlign);
}
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &O) {
// First try the generic code, which knows about modifiers like 'c' and 'n'.
if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
return false;
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0)
return true; // Unknown modifier.
switch (ExtraCode[0]) {
case 'r':
break;
default:
return true;
}
}
// TODO: Should be able to support other operand types like globals.
const MachineOperand &MO = MI->getOperand(OpNo);
if (MO.isReg()) {
AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
*MF->getSubtarget().getRegisterInfo());
return false;
}
if (MO.isImm()) {
int64_t Val = MO.getImm();
if (AMDGPU::isInlinableIntLiteral(Val)) {
O << Val;
} else if (isUInt<16>(Val)) {
O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
} else if (isUInt<32>(Val)) {
O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
} else {
O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
}
return false;
}
return true;
}
// Declares the analyses this pass depends on. The resource usage analysis and
// the machine module info are both required and preserved; the base class
// then adds its own requirements.
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  // Required analyses.
  AU.addRequired<AMDGPUResourceUsageAnalysisWrapperPass>();
  AU.addRequired<MachineModuleInfoWrapperPass>();

  // Analyses left intact by this pass.
  AU.addPreserved<AMDGPUResourceUsageAnalysisWrapperPass>();
  AU.addPreserved<MachineModuleInfoWrapperPass>();

  AsmPrinter::getAnalysisUsage(AU);
}
void AMDGPUAsmPrinter::emitResourceUsageRemarks(
const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
bool isModuleEntryFunction, bool hasMAIInsts) {
if (!ORE)
return;
const char *Name = "kernel-resource-usage";
const char *Indent = " ";
// If the remark is not specifically enabled, do not output to yaml
LLVMContext &Ctx = MF.getFunction().getContext();
if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
return;
// Currently non-kernel functions have no resources to emit.
if (!isEntryFunctionCC(MF.getFunction().getCallingConv()))
return;
auto EmitResourceUsageRemark = [&](StringRef RemarkName,
StringRef RemarkLabel, auto Argument) {
// Add an indent for every line besides the line with the kernel name. This
// makes it easier to tell which resource usage go with which kernel since
// the kernel name will always be displayed first.
std::string LabelStr = RemarkLabel.str() + ": ";
if (RemarkName != "FunctionName")
LabelStr = Indent + LabelStr;
ORE->emit([&]() {
return MachineOptimizationRemarkAnalysis(Name, RemarkName,
MF.getFunction().getSubprogram(),
&MF.front())
<< LabelStr << ore::NV(RemarkName, Argument);
});
};
// FIXME: Formatting here is pretty nasty because clang does not accept
// newlines from diagnostics. This forces us to emit multiple diagnostic
// remarks to simulate newlines. If and when clang does accept newlines, this
// formatting should be aggregated into one remark with newlines to avoid
// printing multiple diagnostic location and diag opts.
EmitResourceUsageRemark("FunctionName", "Function Name",
MF.getFunction().getName());
EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
getMCExprStr(CurrentProgramInfo.NumSGPR));
EmitResourceUsageRemark("NumVGPR", "VGPRs",
getMCExprStr(CurrentProgramInfo.NumArchVGPR));
if (hasMAIInsts) {
EmitResourceUsageRemark("NumAGPR", "AGPRs",
getMCExprStr(CurrentProgramInfo.NumAccVGPR));
}
EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
getMCExprStr(CurrentProgramInfo.ScratchSize));
int64_t DynStack;
bool DynStackEvaluatable =
CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
StringRef DynamicStackStr =
DynStackEvaluatable && DynStack ? "True" : "False";
EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
getMCExprStr(CurrentProgramInfo.Occupancy));
EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
CurrentProgramInfo.SGPRSpill);
EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
CurrentProgramInfo.VGPRSpill);
if (isModuleEntryFunction)
EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
CurrentProgramInfo.LDSSize);
}
// Definition of the static pass ID member of AMDGPUAsmPrinter.
char AMDGPUAsmPrinter::ID = 0;
// Registers the pass under the argument name "amdgpu-asm-printer" with the
// description "AMDGPU Assembly Printer".
// NOTE(review): the two trailing `false` arguments are presumably the macro's
// CFG-only and is-analysis flags - confirm against llvm/PassSupport.h.
INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
                "AMDGPU Assembly Printer", false, false)