
The CWSR trap handler needs to save and restore the VGPRs. When dynamic VGPRs are in use, the fixed-function hardware will only allocate enough space for one VGPR block; the rest has to be stored in scratch, at offset 0.

This patch allocates the necessary space by:
- generating a prologue that checks at runtime if we're on a compute queue (since CWSR only works on compute queues); for this we have to check the ME_ID bits of the ID_HW_ID2 register: if they are non-zero, we can assume we're on a compute queue and initialize the SP and FP with enough room for the dynamic VGPRs
- forcing all compute entry functions to use a FP so they can access their locals/spills correctly (this isn't ideal, but it's the quickest to implement)

Note that at the moment we allocate enough space for the theoretical maximum number of VGPRs that can be allocated dynamically (for blocks of 16 registers, this is 128, from which we subtract the first 16, which are already allocated by the fixed-function hardware). The sizing sketch below spells out this arithmetic. Future patches may decide to allocate less if they can prove the shader never allocates that many blocks.

Also note that this should not affect any reported stack sizes (e.g. PAL backend_stack_size etc.).
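As a rough illustration of the sizing arithmetic, here is a minimal standalone sketch; the names and the per-lane convention are assumptions for illustration, not the actual helpers used by the backend:

// Hypothetical sketch of the worst-case dynamic-VGPR scratch sizing.
// Assumes blocks of 16 registers and a theoretical maximum of 128
// dynamically allocated VGPRs, as described in the message above.
#include <cstdint>

constexpr uint32_t DynVGPRBlockSize = 16;  // registers per dynamic block
constexpr uint32_t MaxDynVGPRs = 128;      // theoretical per-lane maximum

// The fixed-function hardware already saves the first block, so scratch
// only needs room for the remaining registers, at 4 bytes per VGPR per
// lane: (128 - 16) * 4 = 448 bytes per lane.
constexpr uint32_t dynVGPRScratchBytesPerLane() {
  return (MaxDynVGPRs - DynVGPRBlockSize) * sizeof(uint32_t);
}

static_assert(dynVGPRScratchBytesPerLane() == 448, "112 VGPRs * 4 bytes");

The per-wave footprint would scale this by the wavefront size.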
//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
/// code. When passed an MCAsmStreamer it prints assembly and when passed
/// an MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//
//

#include "AMDGPUAsmPrinter.h"
|
|
#include "AMDGPU.h"
|
|
#include "AMDGPUHSAMetadataStreamer.h"
|
|
#include "AMDGPUMCResourceInfo.h"
|
|
#include "AMDGPUResourceUsageAnalysis.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "MCTargetDesc/AMDGPUInstPrinter.h"
|
|
#include "MCTargetDesc/AMDGPUMCExpr.h"
|
|
#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
|
|
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
|
|
#include "R600AsmPrinter.h"
|
|
#include "SIMachineFunctionInfo.h"
|
|
#include "TargetInfo/AMDGPUTargetInfo.h"
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
#include "Utils/AMDKernelCodeTUtils.h"
|
|
#include "Utils/SIDefinesUtils.h"
|
|
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
|
|
#include "llvm/BinaryFormat/ELF.h"
|
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
#include "llvm/CodeGen/MachineModuleInfo.h"
|
|
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
|
|
#include "llvm/IR/DiagnosticInfo.h"
|
|
#include "llvm/MC/MCAssembler.h"
|
|
#include "llvm/MC/MCContext.h"
|
|
#include "llvm/MC/MCSectionELF.h"
|
|
#include "llvm/MC/MCStreamer.h"
|
|
#include "llvm/MC/TargetRegistry.h"
|
|
#include "llvm/Support/AMDHSAKernelDescriptor.h"
|
|
#include "llvm/Target/TargetLoweringObjectFile.h"
|
|
#include "llvm/Target/TargetMachine.h"
|
|
#include "llvm/TargetParser/TargetParser.h"
|
|
|
|
using namespace llvm;
using namespace llvm::AMDGPU;

// This should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
// and sin_f32, cos_f32 on most parts).

// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
         FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
}

static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(getTheR600Target(),
                                     llvm::createR600AsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
                                     createAMDGPUAsmPrinterPass);
}

AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
    : AsmPrinter(TM, std::move(Streamer)) {
  assert(OutStreamer && "AsmPrinter constructed without streamer");
}

StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}

const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
  return TM.getMCSubtargetInfo();
}

AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
  if (!OutStreamer)
    return nullptr;
  return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
}

void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
  IsTargetStreamerInitialized = false;
}

void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
  IsTargetStreamerInitialized = true;

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
    initializeTargetID(M);

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
      TM.getTargetTriple().getOS() != Triple::AMDPAL)
    return;

  getTargetStreamer()->EmitDirectiveAMDGCNTarget();

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
        CodeObjectVersion);
    HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
  }

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
    getTargetStreamer()->getPALMetadata()->readFromIR(M);
}

void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
  // Init target streamer if it has not yet happened
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(M);

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    getTargetStreamer()->EmitISAVersion();

  // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream->end();
    bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
    (void)Success;
    assert(Success && "Malformed HSA Metadata");
  }
}

void AMDGPUAsmPrinter::emitFunctionBodyStart() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();

  // TODO: We're checking this late, would be nice to check it earlier.
  if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
    report_fatal_error(
        STM.getCPU() + " is only available on code object version 6 or better",
        /*gen_crash_diag*/ false);
  }

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (!getTargetStreamer()->getTargetID())
    initializeTargetID(*F.getParent());

  const auto &FunctionTargetID = STM.getTargetID();
  // Make sure function's xnack settings are compatible with module's
  // xnack settings.
  if (FunctionTargetID.isXnackSupported() &&
      FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getXnackSetting() !=
          getTargetStreamer()->getTargetID()->getXnackSetting()) {
    OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
                                   "' function does not match module xnack setting");
    return;
  }
  // Make sure function's sramecc settings are compatible with module's
  // sramecc settings.
  if (FunctionTargetID.isSramEccSupported() &&
      FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getSramEccSetting() !=
          getTargetStreamer()->getTargetID()->getSramEccSetting()) {
    OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
                                   "' function does not match module sramecc setting");
    return;
  }

  if (!MFI.isEntryFunction())
    return;

  if (STM.isMesaKernel(F) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    AMDGPUMCKernelCodeT KernelCode;
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
    KernelCode.validate(&STM, MF->getContext());
    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
  }

  if (STM.isAmdHsaOS())
    HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}

void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  Streamer.pushSection();
  Streamer.switchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
  ReadOnlySection.ensureMinAlignment(Align(64));

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();

  SmallString<128> KernelName;
  getNameWithPrefix(KernelName, &MF->getFunction());
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      MCBinaryExpr::createSub(
          CurrentProgramInfo.NumSGPRsForWavesPerEU,
          AMDGPUMCExpr::createExtraSGPRs(
              CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
              getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
          Context),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);

  Streamer.popSection();
}

void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
  Register RegNo = MI->getOperand(0).getReg();

  SmallString<128> Str;
  raw_svector_ostream OS(Str);
  OS << "implicit-def: "
     << printReg(RegNo, MF->getSubtarget().getRegisterInfo());

  if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
    OS << " : SGPR spill to VGPR lane";

  OutStreamer->AddComment(OS.str());
  OutStreamer->addBlankLine();
}

void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    AsmPrinter::emitFunctionEntryLabel();
    return;
  }

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
    SmallString<128> SymbolName;
    getNameWithPrefix(SymbolName, &MF->getFunction()),
        getTargetStreamer()->EmitAMDGPUSymbolType(
            SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
  }
  if (DumpCodeInstEmitter) {
    // Disassemble function name label to text.
    DisasmLines.push_back(MF->getName().str() + ":");
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.emplace_back("");
  }

  AsmPrinter::emitFunctionEntryLabel();
}

void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
  if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
    // Write a line for the basic block label if it is not only fallthrough.
    DisasmLines.push_back(
        (Twine("BB") + Twine(getFunctionNumber())
         + "_" + Twine(MBB.getNumber()) + ":").str());
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.emplace_back("");
  }
  AsmPrinter::emitBasicBlockStart(MBB);
}

void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
  if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
      OutContext.reportError({},
                             Twine(GV->getName()) +
                                 ": unsupported initializer for address space");
      return;
    }

    // LDS variables aren't emitted in HSA or PAL yet.
    const Triple::OSType OS = TM.getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return;

    MCSymbol *GVSym = getSymbol(GV);

    GVSym->redefineIfPossible();
    if (GVSym->isDefined() || GVSym->isVariable())
      report_fatal_error("symbol '" + Twine(GVSym->getName()) +
                         "' is already defined");

    const DataLayout &DL = GV->getDataLayout();
    uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
    Align Alignment = GV->getAlign().value_or(Align(4));

    emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
    emitLinkage(GV, GVSym);
    auto *TS = getTargetStreamer();
    TS->emitAMDGPULDS(GVSym, Size, Alignment);
    return;
  }

  AsmPrinter::emitGlobalVariable(GV);
}

bool AMDGPUAsmPrinter::doInitialization(Module &M) {
  CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    switch (CodeObjectVersion) {
    case AMDGPU::AMDHSA_COV4:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
      break;
    case AMDGPU::AMDHSA_COV5:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
      break;
    case AMDGPU::AMDHSA_COV6:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
      break;
    default:
      report_fatal_error("Unexpected code object version");
    }
  }

  return AsmPrinter::doInitialization(M);
}

void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
  if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
    return;

  using RIK = MCResourceInfo::ResourceInfoKind;
  const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
  MCSymbol *FnSym = TM.getSymbol(&F);
  bool IsLocal = F.hasLocalLinkage();

  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  const uint64_t MaxScratchPerWorkitem =
      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
  MCSymbol *ScratchSizeSymbol = RI.getSymbol(
      FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal);
  uint64_t ScratchSize;
  if (ScratchSizeSymbol->isVariable() &&
      TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
      ScratchSize > MaxScratchPerWorkitem) {
    DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
                                          DS_Error);
    F.getContext().diagnose(DiagStackSize);
  }

  // Validate addressable scalar registers (i.e., prior to added implicit
  // SGPRs).
  MCSymbol *NumSGPRSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal);
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (NumSGPRSymbol->isVariable() &&
        TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
                                       NumSgpr, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      F.getContext().diagnose(Diag);
      return;
    }
  }

  MCSymbol *VCCUsedSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal);
  MCSymbol *FlatUsedSymbol = RI.getSymbol(
      FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
  uint64_t VCCUsed, FlatUsed, NumSgpr;

  if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
      FlatUsedSymbol->isVariable() &&
      TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
      TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
      TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {

    // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
    // resolvable.
    NumSgpr += IsaInfo::getNumExtraSGPRs(
        &STM, VCCUsed, FlatUsed,
        getTargetStreamer()->getTargetID()->isXnackOnOrAny());
    if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
        STM.hasSGPRInitBug()) {
      unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
      if (NumSgpr > MaxAddressableNumSGPRs) {
        DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
                                         MaxAddressableNumSGPRs, DS_Error,
                                         DK_ResourceLimit);
        F.getContext().diagnose(Diag);
        return;
      }
    }

    MCSymbol *NumVgprSymbol =
        RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal);
    MCSymbol *NumAgprSymbol =
        RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal);
    uint64_t NumVgpr, NumAgpr;

    MachineModuleInfo &MMI =
        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    MachineFunction *MF = MMI.getMachineFunction(F);
    if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
        TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
        TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
      const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
      unsigned MaxWaves = MFI.getMaxWavesPerEU();
      uint64_t TotalNumVgpr =
          getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
      uint64_t NumVGPRsForWavesPerEU = std::max(
          {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
      uint64_t NumSGPRsForWavesPerEU = std::max(
          {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
      const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
          STM.getOccupancyWithWorkGroupSizes(*MF).second,
          MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
          MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
          OutContext);
      uint64_t Occupancy;

      const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
          F, "amdgpu-waves-per-eu", {0, 0}, true);

      if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
        DiagnosticInfoOptimizationFailure Diag(
            F, F.getSubprogram(),
            "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
            "'" +
                F.getName() + "': desired occupancy was " + Twine(MinWEU) +
                ", final occupancy is " + Twine(Occupancy));
        F.getContext().diagnose(Diag);
        return;
      }
    }
  }
}

bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  // Pad with s_code_end to help tools and guard against instruction prefetch
  // causing stale data in caches. Arguably this should be done by the linker,
  // which is why this isn't done for Mesa.
  const MCSubtargetInfo &STI = *getGlobalSTI();
  if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
      (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
       STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
    OutStreamer->switchSection(getObjFileLowering().getTextSection());
    getTargetStreamer()->EmitCodeEnd(STI);
  }

  // Assign expressions which can only be resolved when all other functions are
  // known.
  RI.finalize(OutContext);

  // Switch section and emit all GPR maximums within the processed module.
  OutStreamer->pushSection();
  MCSectionELF *MaxGPRSection =
      OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
  OutStreamer->switchSection(MaxGPRSection);
  getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext),
                                              RI.getMaxAGPRSymbol(OutContext),
                                              RI.getMaxSGPRSymbol(OutContext));
  OutStreamer->popSection();

  for (Function &F : M.functions())
    validateMCResourceInfo(F);

  RI.reset();

  return AsmPrinter::doFinalization(M);
}

SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
  SmallString<128> Str;
  raw_svector_ostream OSS(Str);
  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
  printAMDGPUMCExpr(New, OSS, MAI);
  return Str;
}

// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
    const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
    const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
    const AMDGPUMachineFunction *MFI) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
                              false);
  OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
  if (NumAGPR && TotalNumVGPR) {
    OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
    OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
                                false);
  }
  OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
                              false);
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
                              false);
}

const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
    const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();
  uint16_t KernelCodeProperties = 0;
  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();

  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }
  if (UserSGPRInfo.hasDispatchPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
  }
  if (UserSGPRInfo.hasQueuePtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
  }
  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
  }
  if (UserSGPRInfo.hasDispatchID()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
  }
  if (UserSGPRInfo.hasFlatScratchInit()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
  }
  if (UserSGPRInfo.hasPrivateSegmentSize()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
  }
  if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
  }

  // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
  // un-evaluatable at this point so it cannot be conditionally checked here.
  // Instead, we'll directly shift the possibly unknown MCExpr into its place
  // and bitwise-or it into KernelCodeProperties.
  const MCExpr *KernelCodePropExpr =
      MCConstantExpr::create(KernelCodeProperties, Ctx);
  const MCExpr *OrValue = MCConstantExpr::create(
      amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
  OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
                                    OrValue, Ctx);
  KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);

  return KernelCodePropExpr;
}

MCKernelDescriptor
AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
                                            const SIProgramInfo &PI) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();

  MCKernelDescriptor KernelDescriptor;

  KernelDescriptor.group_segment_fixed_size =
      MCConstantExpr::create(PI.LDSSize, Ctx);
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;

  Align MaxKernArgAlign;
  KernelDescriptor.kernarg_size = MCConstantExpr::create(
      STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);

  KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
  KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  int64_t PGRM_Rsrc3 = 1;
  bool EvaluatableRsrc3 =
      CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
  (void)PGRM_Rsrc3;
  (void)EvaluatableRsrc3;
  assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
         STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
         static_cast<uint64_t>(PGRM_Rsrc3) == 0);
  KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;

  KernelDescriptor.kernarg_preload = MCConstantExpr::create(
      AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
      Ctx);

  return KernelDescriptor;
}

bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  // Init target streamer lazily on the first function so that previous passes
  // can set metadata.
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(*MF.getFunction().getParent());

  ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
  CurrentProgramInfo.reset(MF);

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
  MCContext &Ctx = MF.getContext();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  bool IsLocal = MF.getFunction().hasLocalLinkage();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(ConfigSection);
  }

  const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
      ResourceUsage->getResourceInfo();
  RI.gatherResourceInfo(MF, Info, OutContext);

  if (MFI->isModuleEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  }

  if (STM.isAmdPalOS()) {
    if (MFI->isEntryFunction())
      EmitPALMetadata(MF, CurrentProgramInfo);
    else if (MFI->isModuleEntryFunction())
      emitPALFunctionMetadata(MF);
  } else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

  DumpCodeInstEmitter = nullptr;
  if (STM.dumpCode()) {
    // For -dumpcode, get the assembler out of the streamer. This only works
    // with -filetype=obj.
    MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
    if (Assembler)
      DumpCodeInstEmitter = Assembler->getEmitterPtr();
  }

  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  emitFunctionBody();

  emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
                           STM.hasMAIInsts());

  {
    using RIK = MCResourceInfo::ResourceInfoKind;
    getTargetStreamer()->EmitMCResourceInfo(
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
                     OutContext, IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
                     OutContext, IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
                     OutContext, IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext,
                     IsLocal),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
                     OutContext, IsLocal));
  }

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      using RIK = MCResourceInfo::ResourceInfoKind;
      OutStreamer->emitRawComment(" Function info:", false);

      emitCommonFunctionComments(
          RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
                       IsLocal)
              ->getVariableValue(),
          STM.hasMAIInsts()
              ? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
                             OutContext, IsLocal)
                    ->getVariableValue()
              : nullptr,
          RI.createTotalNumVGPRs(MF, Ctx),
          RI.createTotalNumSGPRs(
              MF,
              MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
              Ctx),
          RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
                       OutContext, IsLocal)
              ->getVariableValue(),
          CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(
        CurrentProgramInfo.NumArchVGPR,
        STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
        CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
        CurrentProgramInfo.ScratchSize,
        CurrentProgramInfo.getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
        " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
        " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
        " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
        " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
        " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);

    OutStreamer->emitRawComment(
        " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
        " NumSGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
        false);
    OutStreamer->emitRawComment(
        " NumVGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
        false);

    if (STM.hasGFX90AInsts()) {
      const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
          CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
      AdjustedAccum = MCBinaryExpr::createMul(
          AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
      OutStreamer->emitRawComment(
          " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
    }

    OutStreamer->emitRawComment(
        " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);

    OutStreamer->emitRawComment(
        " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
            getMCExprStr(CurrentProgramInfo.ScratchEnable),
        false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                    Twine(CurrentProgramInfo.UserSGPR),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
                                    Twine(CurrentProgramInfo.TrapHandlerEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                    Twine(CurrentProgramInfo.TGIdXEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
                                    Twine(CurrentProgramInfo.TGIdYEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
                                    Twine(CurrentProgramInfo.TGIdZEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
                                    Twine(CurrentProgramInfo.TIdIGCompCount),
                                false);

    [[maybe_unused]] int64_t PGMRSrc3;
    assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
           STM.hasGFX90AInsts() ||
           (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
            static_cast<uint64_t>(PGMRSrc3) == 0));
    if (STM.hasGFX90AInsts()) {
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
          false);
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
          false);
    }
  }

  if (DumpCodeInstEmitter) {

    OutStreamer->switchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->emitBytes(StringRef(DisasmLines[i]));
      OutStreamer->emitBytes(StringRef(Comment));
    }
  }

  return false;
}

// TODO: Fold this into emitFunctionBodyStart.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
  // In the beginning all features are either 'Any' or 'NotSupported',
  // depending on global target features. This will cover empty modules.
  getTargetStreamer()->initializeTargetID(*getGlobalSTI(),
                                          getGlobalSTI()->getFeatureString());

  // If module is empty, we are done.
  if (M.empty())
    return;

  // If module is not empty, need to find first 'Off' or 'On' feature
  // setting per feature from functions in module.
  for (auto &F : M) {
    auto &TSTargetID = getTargetStreamer()->getTargetID();
    if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
        (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
      break;

    const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
    const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
    if (TSTargetID->isXnackSupported())
      if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
    if (TSTargetID->isSramEccSupported())
      if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
  }
}

// AccumOffset computed for the MCExpr equivalent of:
// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
  const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
  const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);

  // Can't be lower than 1 for subsequent alignTo.
  const MCExpr *MaximumTaken =
      AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);

  // Practically, it's computing divideCeil(MaximumTaken, 4).
  const MCExpr *DivCeil = MCBinaryExpr::createDiv(
      AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
      Ctx);

  return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
}

void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  bool IsLocal = MF.getFunction().hasLocalLinkage();
  MCContext &Ctx = MF.getContext();

  auto CreateExpr = [&Ctx](int64_t Value) {
    return MCConstantExpr::create(Value, Ctx);
  };

  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  auto GetSymRefExpr =
      [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
    MCSymbol *Sym =
        RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
    return MCSymbolRefExpr::create(Sym, Ctx);
  };

  using RIK = MCResourceInfo::ResourceInfoKind;
  ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
  ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
  ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
      ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);

  ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
  ProgInfo.TgSplit = STM.isTgSplitEnabled();
  ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
  ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
  ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
  ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
  ProgInfo.DynamicCallStack =
      MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
                             GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
      ProgInfo.VCCUsed, ProgInfo.FlatUsed,
      getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(
          MF.getFunction(), "addressable scalar registers", NumSgpr,
          MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);

  const Function &F = MF.getFunction();

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

  if (isShader(F.getCallingConv())) {
    bool IsPixelShader =
        F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

    // Calculate the number of VGPR registers based on the SPI input registers
    uint32_t InputEna = 0;
    uint32_t InputAddr = 0;
    unsigned LastEna = 0;

    if (IsPixelShader) {
      // Note for IsPixelShader:
      // By this stage, all enabled inputs are tagged in InputAddr as well.
      // We will use InputAddr to determine whether the input counts against the
      // vgpr total and only use the InputEnable to determine the last input
      // that is relevant - if extra arguments are used, then we have to honour
      // the InputAddr for any intermediate non-enabled inputs.
      InputEna = MFI->getPSInputEnable();
      InputAddr = MFI->getPSInputAddr();

      // We only need to consider input args up to the last used arg.
      assert((InputEna || InputAddr) &&
             "PSInputAddr and PSInputEnable should "
             "never both be 0 for AMDGPU_PS shaders");
      // There are some rare circumstances where InputAddr is non-zero and
      // InputEna can be set to 0. In this case we default to setting LastEna
      // to 1.
      LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
    }

    // FIXME: We should be using the number of registers determined during
    // calling convention lowering to legalize the types.
    const DataLayout &DL = F.getDataLayout();
    unsigned PSArgCount = 0;
    unsigned IntermediateVGPR = 0;
    for (auto &Arg : F.args()) {
      unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
      if (Arg.hasAttribute(Attribute::InReg)) {
        WaveDispatchNumSGPR += NumRegs;
      } else {
        // If this is a PS shader and we're processing the PS Input args (first
        // 16 VGPR), use the InputEna and InputAddr bits to define how many
        // VGPRs are actually used.
        // Any extra VGPR arguments are handled as normal arguments (and
        // contribute to the VGPR count whether they're used or not).
        if (IsPixelShader && PSArgCount < 16) {
          if ((1 << PSArgCount) & InputAddr) {
            if (PSArgCount < LastEna)
              WaveDispatchNumVGPR += NumRegs;
            else
              IntermediateVGPR += NumRegs;
          }
          PSArgCount++;
        } else {
          // If there are extra arguments we have to include the allocation for
          // the non-used (but enabled with InputAddr) input arguments
          if (IntermediateVGPR) {
            WaveDispatchNumVGPR += IntermediateVGPR;
            IntermediateVGPR = 0;
          }
          WaveDispatchNumVGPR += NumRegs;
        }
      }
    }
    ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);

    ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);

    ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
        ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
  } else if (isKernel(F.getCallingConv()) &&
             MFI->getNumKernargPreloadedSGPRs()) {
    // Consider cases where the total number of UserSGPRs with trailing
    // allocated preload SGPRs, is greater than the number of explicitly
    // referenced SGPRs.
    const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
        CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
    ProgInfo.NumSGPR =
        AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
  }

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  unsigned MaxWaves = MFI->getMaxWavesPerEU();
  ProgInfo.NumSGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
                              Ctx);
  ProgInfo.NumVGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
                              Ctx);

  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
                                       NumSgpr, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
      ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
    ProgInfo.NumSGPRsForWavesPerEU =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(),
                                     STM.getMaxNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() >
      static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(
        MF.getFunction(), "local memory", MFI->getLDSSize(),
        STM.getAddressableLocalMemorySize(), DS_Error);
    Ctx.diagnose(Diag);
  }
  // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
  // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
  auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
                                             unsigned Granule) {
    const MCExpr *OneConst = CreateExpr(1ul);
    const MCExpr *GranuleConst = CreateExpr(Granule);
    const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
    const MCExpr *AlignToGPR =
        AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
    const MCExpr *DivGPR =
        MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
    const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
    return SubGPR;
  };

  ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
                                        IsaInfo::getSGPREncodingGranule(&STM));
  ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
                                        IsaInfo::getVGPREncodingGranule(&STM));

  const SIModeRegisterDefaults Mode = MFI->getMode();

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(Mode);

  ProgInfo.IEEEMode = Mode.IEEE;

  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  unsigned LDSAlignShift;
  if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
    // LDS is allocated in 320 dword blocks.
    LDSAlignShift = 11;
  } else if (STM.getFeatureBits().test(
                 FeatureAddressableLocalMemorySize65536)) {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  } else {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  }

  ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
  ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();

  ProgInfo.LDSSize = MFI->getLDSSize();
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // The MCExpr equivalent of divideCeil.
  auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
    const MCExpr *Ceil =
        AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
    return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
  };

  // Scratch is allocated in 64-dword or 256-dword blocks.
  unsigned ScratchAlignShift =
      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks = DivideCeil(
      MCBinaryExpr::createMul(ProgInfo.ScratchSize,
                              CreateExpr(STM.getWavefrontSize()), Ctx),
      CreateExpr(1ULL << ScratchAlignShift));

  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
    ProgInfo.MemOrdered = 1;
    ProgInfo.FwdProgress = 1;
  }

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  // The private segment wave byte offset is the last of the system SGPRs. We
  // initially assumed it was allocated, and may have used it. It shouldn't harm
  // anything to disable it if we know the stack isn't used here. We may still
  // have emitted code reading it to initialize scratch, but if that's unused
  // reading garbage should be OK.
  ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
      MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,
                             MCConstantExpr::create(0, Ctx), Ctx),
      ProgInfo.DynamicCallStack, Ctx);

  ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
  // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
  ProgInfo.TrapHandlerEnable =
      STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
  ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
  ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
  ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
  ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
  ProgInfo.TIdIGCompCount = TIDIGCompCnt;
  ProgInfo.EXCPEnMSB = 0;
  // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
  ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
  ProgInfo.EXCPEnable = 0;

  // return ((Dst & ~Mask) | (Value << Shift))
  auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
                        uint32_t Shift) {
    const auto *Shft = MCConstantExpr::create(Shift, Ctx);
    const auto *Msk = MCConstantExpr::create(Mask, Ctx);
    Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
    Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
                                 Ctx);
    return Dst;
  };

  if (STM.hasGFX90AInsts()) {
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
  }

  ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
      STM.computeOccupancy(F, ProgInfo.LDSSize).second,
      ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);

  const auto [MinWEU, MaxWEU] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
  uint64_t Occupancy;
  if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(),
        "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
        "'" +
            F.getName() + "': desired occupancy was " + Twine(MinWEU) +
            ", final occupancy is " + Twine(Occupancy));
    F.getContext().diagnose(Diag);
  }

  if (isGFX11Plus(STM)) {
    uint32_t CodeSizeInBytes = (uint32_t)std::min(
        ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
        (uint64_t)std::numeric_limits<uint32_t>::max());
    uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
    uint32_t Field, Shift, Width;
    if (isGFX11(STM)) {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
    } else {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
    }
    uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
    ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
                                       CreateExpr(InstPrefSize), Field, Shift);
  }
}

static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: [[fallthrough]];
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  }
}

void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
  MCContext &Ctx = MF.getContext();

  // (((Value) & Mask) << Shift)
  auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
    const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
    const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
    return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),
                                   shft, Ctx);
  };

  auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val))
      OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
    else
      OutStreamer->emitValue(Value, Size);
  };

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);

    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
                       /*Size=*/4);

    OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);

    OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);

    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->emitInt32(RsrcReg);

    const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
        SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
        SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
        MF.getContext());
    EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
    OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);

    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
    OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
    OutStreamer->emitInt32(MFI->getPSInputEnable());
    OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
    OutStreamer->emitInt32(MFI->getPSInputAddr());
  }

  OutStreamer->emitInt32(R_SPILLED_SGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
  OutStreamer->emitInt32(R_SPILLED_VGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}

// Helper function to add common PAL Metadata 3.0+
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
                                  const SIProgramInfo &CurrentProgramInfo,
                                  CallingConv::ID CC, const GCNSubtarget &ST) {
  if (ST.hasIEEEMode())
    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);

  MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
  MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);

  if (AMDGPU::isCompute(CC)) {
    MD->setHwStage(CC, ".trap_present",
                   (bool)CurrentProgramInfo.TrapHandlerEnable);
    MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);

    if (ST.isDynamicVGPREnabled())
      MD->setComputeRegisters(".dynamic_vgpr_en", true);
  }

  MD->setHwStage(CC, ".lds_size",
                 (unsigned)(CurrentProgramInfo.LdsSize *
                            getLdsDwGranularity(ST) * sizeof(uint32_t)));
}

// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
                                       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto *MD = getTargetStreamer()->getPALMetadata();
  auto &Ctx = MF.getContext();

  MD->setEntryPoint(CC, MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);

  // For targets that support dynamic VGPRs, set the number of saved dynamic
  // VGPRs (if any) in the PAL metadata.
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
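  // The saved count is stored in dwords: getScratchReservedForDynamicVGPRs()
  // returns bytes, hence the division by 4 (a reading of the code below, not
  // a PAL spec citation).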
  if (STM.isDynamicVGPREnabled() &&
      MFI->getScratchReservedForDynamicVGPRs() > 0)
    MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
                   MFI->getScratchReservedForDynamicVGPRs() / 4);

  // Only set AGPRs for supported devices.
  if (STM.hasMAIInsts()) {
    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
  }

  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
  if (MD->getPALMajorVersion() < 3) {
    MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
    if (AMDGPU::isCompute(CC)) {
      MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
    } else {
      const MCExpr *HasScratchBlocks =
          MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
                                 MCConstantExpr::create(0, Ctx), Ctx);
      auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
      MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
    }
  } else {
    MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
    MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
                   CurrentProgramInfo.ScratchEnable);
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
  }

  // ScratchSize is in bytes, 16 aligned.
  MD->setScratchSize(
      CC,
      AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
                                  MCConstantExpr::create(16, Ctx), Ctx),
      Ctx);

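  // Illustrative example of the alignTo above: a raw ScratchSize of 20 bytes
  // is reported as 32.
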
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    if (MD->getPALMajorVersion() < 3) {
      MD->setRsrc2(
          CC,
          MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
          Ctx);
      MD->setSpiPsInputEna(MFI->getPSInputEnable());
      MD->setSpiPsInputAddr(MFI->getPSInputAddr());
    } else {
      // Graphics registers
      const unsigned ExtraLdsDwGranularity =
          STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
      MD->setGraphicsRegisters(
          ".ps_extra_lds_size",
          (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));

      // Mirror PsInputEna and PsInputAddr into the .spi_ps_input_ena and
      // .spi_ps_input_addr metadata nodes, one boolean per field.
      static StringLiteral const PsInputFields[] = {
          ".persp_sample_ena",    ".persp_center_ena",
          ".persp_centroid_ena",  ".persp_pull_model_ena",
          ".linear_sample_ena",   ".linear_center_ena",
          ".linear_centroid_ena", ".line_stipple_tex_ena",
          ".pos_x_float_ena",     ".pos_y_float_ena",
          ".pos_z_float_ena",     ".pos_w_float_ena",
          ".front_face_ena",      ".ancillary_ena",
          ".sample_coverage_ena", ".pos_fixed_pt_ena"};
      unsigned PSInputEna = MFI->getPSInputEnable();
      unsigned PSInputAddr = MFI->getPSInputAddr();
      for (auto [Idx, Field] : enumerate(PsInputFields)) {
        MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
                                 (bool)((PSInputEna >> Idx) & 1));
        MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
                                 (bool)((PSInputAddr >> Idx) & 1));
      }
    }
  }

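  // Note: the loop above maps bit 0 of PSInputEna/PSInputAddr to
  // .persp_sample_ena, bit 1 to .persp_center_ena, and so on through
  // PsInputFields.
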
  // For version 3 and above the wavefront size is already set in the metadata.
  if (MD->getPALMajorVersion() < 3 && STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}

void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
  auto *MD = getTargetStreamer()->getPALMetadata();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  StringRef FnName = MF.getFunction().getName();
  MD->setFunctionScratchSize(FnName, MFI.getStackSize());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  if (MD->getPALMajorVersion() < 3) {
    // Set compute registers
    MD->setRsrc1(
        CallingConv::AMDGPU_CS,
        CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
    MD->setRsrc2(CallingConv::AMDGPU_CS,
                 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
  } else {
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
  }

  // Set optional info
  MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
  MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}

// This is supposed to be log2(Size)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}

void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);

  Out.compute_pgm_resource1_registers =
      CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
  Out.compute_pgm_resource2_registers =
      CurrentProgramInfo.getComputePGMRSrc2(Ctx);
  Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;

  Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;

  AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));

  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }

  if (UserSGPRInfo.hasDispatchPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (UserSGPRInfo.hasQueuePtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

  if (UserSGPRInfo.hasKernargSegmentPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

  if (UserSGPRInfo.hasDispatchID())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

  if (UserSGPRInfo.hasFlatScratchInit())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;

  if (UserSGPRInfo.hasPrivateSegmentSize())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;

  if (STM.isXNACKEnabled())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

  Align MaxKernArgAlign;
  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

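  // Illustrative example of the clamp below: a MaxKernArgAlign of 8 is raised
  // to 16, giving Log2(Align(16)) == 4.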
  // kernarg_segment_alignment is specified as log of the alignment.
  // The minimum alignment is 16.
  // FIXME: The metadata treats the minimum as 4?
  Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}

bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      break;
    default:
      return true;
    }
  }

  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                       *MF->getSubtarget().getRegisterInfo());
    return false;
  }
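  // Inlinable integer literals print in decimal; any other immediate prints
  // in hex at the narrowest width that fits (illustrative: 255 prints as
  // 0xff).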
  if (MO.isImm()) {
    int64_t Val = MO.getImm();
    if (AMDGPU::isInlinableIntLiteral(Val)) {
      O << Val;
    } else if (isUInt<16>(Val)) {
      O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
    } else if (isUInt<32>(Val)) {
      O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
    } else {
      O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
    }
    return false;
  }
  return true;
}

void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUResourceUsageAnalysis>();
  AU.addPreserved<AMDGPUResourceUsageAnalysis>();
  AU.addRequired<MachineModuleInfoWrapperPass>();
  AU.addPreserved<MachineModuleInfoWrapperPass>();
  AsmPrinter::getAnalysisUsage(AU);
}

void AMDGPUAsmPrinter::emitResourceUsageRemarks(
    const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
    bool isModuleEntryFunction, bool hasMAIInsts) {
  if (!ORE)
    return;

  const char *Name = "kernel-resource-usage";
  const char *Indent = "    ";

  // If the remark is not specifically enabled, do not output to yaml.
  LLVMContext &Ctx = MF.getFunction().getContext();
  if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
    return;

  // Currently non-kernel functions have no resources to emit.
  if (!isEntryFunctionCC(MF.getFunction().getCallingConv()))
    return;

  auto EmitResourceUsageRemark = [&](StringRef RemarkName,
                                     StringRef RemarkLabel, auto Argument) {
    // Add an indent for every line besides the line with the kernel name. This
    // makes it easier to tell which resource usage goes with which kernel,
    // since the kernel name is always displayed first.
    std::string LabelStr = RemarkLabel.str() + ": ";
    if (RemarkName != "FunctionName")
      LabelStr = Indent + LabelStr;

    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(Name, RemarkName,
                                               MF.getFunction().getSubprogram(),
                                               &MF.front())
             << LabelStr << ore::NV(RemarkName, Argument);
    });
  };

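  // Illustrative shape of the emitted remark stream (values depend on the
  // kernel):
  //   Function Name: my_kernel
  //       TotalSGPRs: 24
  //       VGPRs: 8
  //       ...
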
  // FIXME: Formatting here is pretty nasty because clang does not accept
  // newlines from diagnostics. This forces us to emit multiple diagnostic
  // remarks to simulate newlines. If and when clang does accept newlines, this
  // formatting should be aggregated into one remark with newlines to avoid
  // printing multiple diagnostic location and diag opts.
  EmitResourceUsageRemark("FunctionName", "Function Name",
                          MF.getFunction().getName());
  EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
                          getMCExprStr(CurrentProgramInfo.NumSGPR));
  EmitResourceUsageRemark("NumVGPR", "VGPRs",
                          getMCExprStr(CurrentProgramInfo.NumArchVGPR));
  if (hasMAIInsts) {
    EmitResourceUsageRemark("NumAGPR", "AGPRs",
                            getMCExprStr(CurrentProgramInfo.NumAccVGPR));
  }
  EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
                          getMCExprStr(CurrentProgramInfo.ScratchSize));
  int64_t DynStack;
  bool DynStackEvaluatable =
      CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
  StringRef DynamicStackStr =
      DynStackEvaluatable && DynStack ? "True" : "False";
  EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
  EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
                          getMCExprStr(CurrentProgramInfo.Occupancy));
  EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
                          CurrentProgramInfo.SGPRSpill);
  EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
                          CurrentProgramInfo.VGPRSpill);
  if (isModuleEntryFunction)
    EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
                            CurrentProgramInfo.LDSSize);
}