[AMDGPU] Convert AMDGPUResourceUsageAnalysis pass from Module to MF pass (#102913)

Converts the AMDGPUResourceUsageAnalysis pass from a Module pass to a
MachineFunction pass. Moves function resource info propagation to the MC
layer (through helpers in AMDGPUMCResourceInfo) by generating MCExprs for
every function resource, for which the emitters have been prepared.

Fixes https://github.com/llvm/llvm-project/issues/64863
Janek van Oirschot 2024-09-30 11:43:34 +01:00 committed by GitHub
parent 3c85102756
commit c897c13dde
37 changed files with 1567 additions and 486 deletions


@ -2,7 +2,7 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null
// expected-remark@+10 {{Function Name: foo}}
// expected-remark@+9 {{ SGPRs: 13}}
// expected-remark@+9 {{ TotalSGPRs: 13}}
// expected-remark@+8 {{ VGPRs: 10}}
// expected-remark@+7 {{ AGPRs: 12}}
// expected-remark@+6 {{ ScratchSize [bytes/lane]: 0}}


@ -1757,6 +1757,55 @@ As part of the AMDGPU MC layer, AMDGPU provides the following target specific
=================== ================= ========================================================
Function Resource Usage
-----------------------
A function's resource usage depends on the resource usage of each of its
callees. The expressions used to denote resource usage reflect this by
propagating each callee's equivalent expression. These expressions are
emitted as symbols by the compiler when compiling to either assembly or
object format and should not be overwritten or redefined.
The following describes all emitted function resource usage symbols:
.. table:: Function Resource Usage:
:name: function-usage-table
===================================== ========= ========================================= ===============================================================================
Symbol Type Description Example
===================================== ========= ========================================= ===============================================================================
<function_name>.num_vgpr Integer Number of VGPRs used by <function_name>, .set foo.num_vgpr, max(32, bar.num_vgpr, baz.num_vgpr)
worst case of itself and its callees'
VGPR use
<function_name>.num_agpr Integer Number of AGPRs used by <function_name>, .set foo.num_agpr, max(35, bar.num_agpr)
worst case of itself and its callees'
AGPR use
<function_name>.numbered_sgpr         Integer   Number of SGPRs used by <function_name>,  .set foo.numbered_sgpr, 21
worst case of itself and its callees'
SGPR use (without any of the implicitly
used SGPRs)
<function_name>.private_seg_size Integer Total stack size required for .set foo.private_seg_size, 16+max(bar.private_seg_size, baz.private_seg_size)
<function_name>, expression is the
locally used stack size + the worst case
callee
<function_name>.uses_vcc Bool Whether <function_name>, or any of its .set foo.uses_vcc, or(0, bar.uses_vcc)
callees, uses vcc
<function_name>.uses_flat_scratch Bool Whether <function_name>, or any of its .set foo.uses_flat_scratch, 1
callees, uses flat scratch or not
<function_name>.has_dyn_sized_stack Bool Whether <function_name>, or any of its .set foo.has_dyn_sized_stack, 1
callees, is dynamically sized
<function_name>.has_recursion Bool Whether <function_name>, or any of its .set foo.has_recursion, 0
callees, contains recursion
<function_name>.has_indirect_call     Bool      Whether <function_name>, or any of its    .set foo.has_indirect_call, or(0, bar.has_indirect_call)
callees, contains an indirect call
===================================== ========= ========================================= ===============================================================================
Furthermore, three additional symbols are emitted describing the compilation
unit's worst case (i.e., maxima) ``num_vgpr``, ``num_agpr``, and
``numbered_sgpr``, which may be referenced and used by the aforementioned
symbolic expressions. These three symbols are ``amdgpu.max_num_vgpr``,
``amdgpu.max_num_agpr``, and ``amdgpu.max_num_sgpr``.
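For example, for a function ``foo`` that directly calls ``bar``, and a
function ``baz`` that contains an indirect call, the emitted expressions may
look like the following (register counts are illustrative)::

  .set bar.num_vgpr, 32
  .set foo.num_vgpr, max(34, bar.num_vgpr)
  .set baz.num_vgpr, max(32, amdgpu.max_num_vgpr)
  .set amdgpu.max_num_vgpr, 32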
.. _amdgpu-elf-code-object:
ELF Code Object


@ -18,6 +18,7 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPUMCResourceInfo.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
@ -33,6 +34,7 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
@ -359,6 +361,127 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
return AsmPrinter::doInitialization(M);
}
void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
return;
using RIK = MCResourceInfo::ResourceInfoKind;
const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
int64_t Val;
if (Value->evaluateAsAbsolute(Val)) {
Res = Val;
return true;
}
return false;
};
const uint64_t MaxScratchPerWorkitem =
STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
MCSymbol *ScratchSizeSymbol =
RI.getSymbol(F.getName(), RIK::RIK_PrivateSegSize, OutContext);
uint64_t ScratchSize;
if (ScratchSizeSymbol->isVariable() &&
TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
ScratchSize > MaxScratchPerWorkitem) {
DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
DS_Error);
F.getContext().diagnose(DiagStackSize);
}
// Validate addressable scalar registers (i.e., prior to added implicit
// SGPRs).
MCSymbol *NumSGPRSymbol =
RI.getSymbol(F.getName(), RIK::RIK_NumSGPR, OutContext);
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
!STM.hasSGPRInitBug()) {
unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
uint64_t NumSgpr;
if (NumSGPRSymbol->isVariable() &&
TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
NumSgpr > MaxAddressableNumSGPRs) {
DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
NumSgpr, MaxAddressableNumSGPRs,
DS_Error, DK_ResourceLimit);
F.getContext().diagnose(Diag);
return;
}
}
MCSymbol *VCCUsedSymbol =
RI.getSymbol(F.getName(), RIK::RIK_UsesVCC, OutContext);
MCSymbol *FlatUsedSymbol =
RI.getSymbol(F.getName(), RIK::RIK_UsesFlatScratch, OutContext);
uint64_t VCCUsed, FlatUsed, NumSgpr;
if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
FlatUsedSymbol->isVariable() &&
TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
// Recomputes NumSgprs + implicit SGPRs but all symbols should now be
// resolvable.
NumSgpr += IsaInfo::getNumExtraSGPRs(
&STM, VCCUsed, FlatUsed,
getTargetStreamer()->getTargetID()->isXnackOnOrAny());
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
STM.hasSGPRInitBug()) {
unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
if (NumSgpr > MaxAddressableNumSGPRs) {
DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
MaxAddressableNumSGPRs, DS_Error,
DK_ResourceLimit);
F.getContext().diagnose(Diag);
return;
}
}
MCSymbol *NumVgprSymbol =
RI.getSymbol(F.getName(), RIK::RIK_NumVGPR, OutContext);
MCSymbol *NumAgprSymbol =
RI.getSymbol(F.getName(), RIK::RIK_NumAGPR, OutContext);
uint64_t NumVgpr, NumAgpr;
MachineModuleInfo &MMI =
getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
MachineFunction *MF = MMI.getMachineFunction(F);
if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
unsigned MaxWaves = MFI.getMaxWavesPerEU();
uint64_t TotalNumVgpr =
getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
uint64_t NumVGPRsForWavesPerEU = std::max(
{TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
uint64_t NumSGPRsForWavesPerEU = std::max(
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, MFI.getLDSSize()),
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
OutContext);
uint64_t Occupancy;
const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
F, "amdgpu-waves-per-eu", {0, 0}, true);
if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
DiagnosticInfoOptimizationFailure Diag(
F, F.getSubprogram(),
"failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
"'" +
F.getName() + "': desired occupancy was " + Twine(MinWEU) +
", final occupancy is " + Twine(Occupancy));
F.getContext().diagnose(Diag);
return;
}
}
}
}
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Pad with s_code_end to help tools and guard against instruction prefetch
// causing stale data in caches. Arguably this should be done by the linker,
@ -371,25 +494,24 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
getTargetStreamer()->EmitCodeEnd(STI);
}
return AsmPrinter::doFinalization(M);
}
// Assign expressions which can only be resolved when all other functions are
// known.
RI.finalize(OutContext);
// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
const AMDGPUMachineFunction *MFI) {
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
if (NumAGPR) {
OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
false);
}
OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
false);
// Switch section and emit all GPR maximums within the processed module.
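// After RI.finalize above these resolve to constants, e.g. (illustrative):
//   .set amdgpu.max_num_vgpr, 32
//   .set amdgpu.max_num_agpr, 32
//   .set amdgpu.max_num_sgpr, 34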
OutStreamer->pushSection();
MCSectionELF *MaxGPRSection =
OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
OutStreamer->switchSection(MaxGPRSection);
getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext),
RI.getMaxAGPRSymbol(OutContext),
RI.getMaxSGPRSymbol(OutContext));
OutStreamer->popSection();
for (Function &F : M.functions())
validateMCResourceInfo(F);
return AsmPrinter::doFinalization(M);
}
SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
@ -402,12 +524,14 @@ SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
return Str;
}
// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
const AMDGPUMachineFunction *MFI) {
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false);
OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
false);
OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
if (NumAGPR && TotalNumVGPR) {
OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
@ -540,6 +664,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->switchSection(ConfigSection);
}
const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
ResourceUsage->getResourceInfo();
RI.gatherResourceInfo(MF, Info, OutContext);
if (MFI->isModuleEntryFunction()) {
getSIProgramInfo(CurrentProgramInfo, MF);
}
@ -571,21 +699,44 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
STM.hasMAIInsts());
{
using RIK = MCResourceInfo::ResourceInfoKind;
getTargetStreamer()->EmitMCResourceInfo(
RI.getSymbol(MF.getName(), RIK::RIK_NumVGPR, OutContext),
RI.getSymbol(MF.getName(), RIK::RIK_NumAGPR, OutContext),
RI.getSymbol(MF.getName(), RIK::RIK_NumSGPR, OutContext),
RI.getSymbol(MF.getName(), RIK::RIK_PrivateSegSize, OutContext),
RI.getSymbol(MF.getName(), RIK::RIK_UsesVCC, OutContext),
RI.getSymbol(MF.getName(), RIK::RIK_UsesFlatScratch, OutContext),
RI.getSymbol(MF.getName(), RIK::RIK_HasDynSizedStack, OutContext),
RI.getSymbol(MF.getName(), RIK::RIK_HasRecursion, OutContext),
RI.getSymbol(MF.getName(), RIK::RIK_HasIndirectCall, OutContext));
}
if (isVerbose()) {
MCSectionELF *CommentSection =
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
OutStreamer->switchSection(CommentSection);
if (!MFI->isEntryFunction()) {
using RIK = MCResourceInfo::ResourceInfoKind;
OutStreamer->emitRawComment(" Function info:", false);
const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
ResourceUsage->getResourceInfo(&MF.getFunction());
emitCommonFunctionComments(
Info.NumVGPR,
STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
Info.getTotalNumVGPRs(STM),
Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
RI.getSymbol(MF.getName(), RIK::RIK_NumVGPR, OutContext)
->getVariableValue(),
STM.hasMAIInsts()
? RI.getSymbol(MF.getName(), RIK::RIK_NumAGPR, OutContext)
->getVariableValue()
: nullptr,
RI.createTotalNumVGPRs(MF, Ctx),
RI.createTotalNumSGPRs(
MF,
MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
Ctx),
RI.getSymbol(MF.getName(), RIK::RIK_PrivateSegSize, OutContext)
->getVariableValue(),
getFunctionCodeSize(MF), MFI);
return false;
}
@ -751,10 +902,26 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const
return CodeSize;
}
// AccumOffset computed for the MCExpr equivalent of:
// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
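// For example, NumVGPR = 35 gives alignTo(max(1, 35), 4) / 4 - 1 = 36 / 4 - 1 = 8.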
static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
// Can't be lower than 1 for subsequent alignTo.
const MCExpr *MaximumTaken =
AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
// Practically, it's computing divideCeil(MaximumTaken, 4).
const MCExpr *DivCeil = MCBinaryExpr::createDiv(
AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
Ctx);
return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
}
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) {
const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
ResourceUsage->getResourceInfo(&MF.getFunction());
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
MCContext &Ctx = MF.getContext();
@ -771,28 +938,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
return false;
};
ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
ProgInfo.AccumOffset =
CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
ProgInfo.TgSplit = STM.isTgSplitEnabled();
ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
ProgInfo.DynamicCallStack =
CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
auto GetSymRefExpr =
[&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
MCSymbol *Sym = RI.getSymbol(MF.getName(), RIK, OutContext);
return MCSymbolRefExpr::create(Sym, Ctx);
};
const uint64_t MaxScratchPerWorkitem =
STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
uint64_t ScratchSize;
if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
ScratchSize > MaxScratchPerWorkitem) {
DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
MaxScratchPerWorkitem, DS_Error);
MF.getFunction().getContext().diagnose(DiagStackSize);
}
using RIK = MCResourceInfo::ResourceInfoKind;
ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
ProgInfo.TgSplit = STM.isTgSplitEnabled();
ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
ProgInfo.DynamicCallStack =
MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@ -1477,6 +1643,8 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AMDGPUResourceUsageAnalysis>();
AU.addPreserved<AMDGPUResourceUsageAnalysis>();
AU.addRequired<MachineModuleInfoWrapperPass>();
AU.addPreserved<MachineModuleInfoWrapperPass>();
AsmPrinter::getAnalysisUsage(AU);
}
@ -1522,7 +1690,7 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
// printing multiple diagnostic location and diag opts.
EmitResourceUsageRemark("FunctionName", "Function Name",
MF.getFunction().getName());
EmitResourceUsageRemark("NumSGPR", "SGPRs",
EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
getMCExprStr(CurrentProgramInfo.NumSGPR));
EmitResourceUsageRemark("NumVGPR", "VGPRs",
getMCExprStr(CurrentProgramInfo.NumArchVGPR));


@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
#include "AMDGPUMCResourceInfo.h"
#include "SIProgramInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
@ -24,6 +25,7 @@ struct AMDGPUResourceUsageAnalysis;
class AMDGPUTargetStreamer;
class MCCodeEmitter;
class MCOperand;
class MCResourceInfo;
namespace AMDGPU {
struct MCKernelDescriptor;
@ -40,6 +42,8 @@ private:
AMDGPUResourceUsageAnalysis *ResourceUsage;
MCResourceInfo RI;
SIProgramInfo CurrentProgramInfo;
std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
@ -60,11 +64,6 @@ private:
void EmitPALMetadata(const MachineFunction &MF,
const SIProgramInfo &KernelInfo);
void emitPALFunctionMetadata(const MachineFunction &MF);
void emitCommonFunctionComments(uint32_t NumVGPR,
std::optional<uint32_t> NumAGPR,
uint32_t TotalNumVGPR, uint32_t NumSGPR,
uint64_t ScratchSize, uint64_t CodeSize,
const AMDGPUMachineFunction *MFI);
void emitCommonFunctionComments(const MCExpr *NumVGPR, const MCExpr *NumAGPR,
const MCExpr *TotalNumVGPR,
const MCExpr *NumSGPR,
@ -84,6 +83,11 @@ private:
SmallString<128> getMCExprStr(const MCExpr *Value);
/// Performs the validation that is skipped in getSIProgramInfo because the
/// MCExprs are not yet resolvable there. Invoked during doFinalization, once
/// the MCResourceInfo symbols are known.
void validateMCResourceInfo(Function &F);
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer);


@ -0,0 +1,224 @@
//===- AMDGPUMCResourceInfo.cpp --- MC Resource Info ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief MC infrastructure to propagate the function level resource usage
/// info.
///
//===----------------------------------------------------------------------===//
#include "AMDGPUMCResourceInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbol.h"
using namespace llvm;
MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK,
MCContext &OutContext) {
auto GOCS = [this, FuncName, &OutContext](StringRef Suffix) {
return OutContext.getOrCreateSymbol(FuncName + Twine(Suffix));
};
switch (RIK) {
case RIK_NumVGPR:
return GOCS(".num_vgpr");
case RIK_NumAGPR:
return GOCS(".num_agpr");
case RIK_NumSGPR:
return GOCS(".numbered_sgpr");
case RIK_PrivateSegSize:
return GOCS(".private_seg_size");
case RIK_UsesVCC:
return GOCS(".uses_vcc");
case RIK_UsesFlatScratch:
return GOCS(".uses_flat_scratch");
case RIK_HasDynSizedStack:
return GOCS(".has_dyn_sized_stack");
case RIK_HasRecursion:
return GOCS(".has_recursion");
case RIK_HasIndirectCall:
return GOCS(".has_indirect_call");
}
llvm_unreachable("Unexpected ResourceInfoKind.");
}
const MCExpr *MCResourceInfo::getSymRefExpr(StringRef FuncName,
ResourceInfoKind RIK,
MCContext &Ctx) {
return MCSymbolRefExpr::create(getSymbol(FuncName, RIK, Ctx), Ctx);
}
void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
// Assign the maximum tracked register use to each max_num_Xgpr symbol.
MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext);
MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
auto assignMaxRegSym = [this, &OutContext](MCSymbol *Sym, int32_t RegCount) {
const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext);
Sym->setVariableValue(MaxExpr);
};
assignMaxRegSym(MaxVGPRSym, MaxVGPR);
assignMaxRegSym(MaxAGPRSym, MaxAGPR);
assignMaxRegSym(MaxSGPRSym, MaxSGPR);
}
void MCResourceInfo::finalize(MCContext &OutContext) {
assert(!Finalized && "Cannot finalize ResourceInfo again.");
Finalized = true;
assignMaxRegs(OutContext);
}
MCSymbol *MCResourceInfo::getMaxVGPRSymbol(MCContext &OutContext) {
return OutContext.getOrCreateSymbol("amdgpu.max_num_vgpr");
}
MCSymbol *MCResourceInfo::getMaxAGPRSymbol(MCContext &OutContext) {
return OutContext.getOrCreateSymbol("amdgpu.max_num_agpr");
}
MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
}
void MCResourceInfo::assignResourceInfoExpr(
int64_t LocalValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
const MachineFunction &MF, const SmallVectorImpl<const Function *> &Callees,
MCContext &OutContext) {
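// Builds Kind(LocalValue, <callee symbols>...) and assigns it to this
// function's resource symbol, e.g.:
//   .set foo.num_vgpr, max(32, bar.num_vgpr, baz.num_vgpr)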
const MCConstantExpr *LocalConstExpr =
MCConstantExpr::create(LocalValue, OutContext);
const MCExpr *SymVal = LocalConstExpr;
if (!Callees.empty()) {
SmallVector<const MCExpr *, 8> ArgExprs;
// Avoid recursive symbol assignment.
SmallPtrSet<const Function *, 8> Seen;
ArgExprs.push_back(LocalConstExpr);
const Function &F = MF.getFunction();
Seen.insert(&F);
for (const Function *Callee : Callees) {
if (!Seen.insert(Callee).second)
continue;
MCSymbol *CalleeValSym = getSymbol(Callee->getName(), RIK, OutContext);
ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext));
}
SymVal = AMDGPUMCExpr::create(Kind, ArgExprs, OutContext);
}
MCSymbol *Sym = getSymbol(MF.getName(), RIK, OutContext);
Sym->setVariableValue(SymVal);
}
void MCResourceInfo::gatherResourceInfo(
const MachineFunction &MF,
const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI,
MCContext &OutContext) {
// Module-wide worst-case register use among non-hardware-entrypoints.
MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext);
MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
addMaxVGPRCandidate(FRI.NumVGPR);
addMaxAGPRCandidate(FRI.NumAGPR);
addMaxSGPRCandidate(FRI.NumExplicitSGPR);
}
auto SetMaxReg = [&](MCSymbol *MaxSym, int32_t numRegs,
ResourceInfoKind RIK) {
if (!FRI.HasIndirectCall) {
assignResourceInfoExpr(numRegs, RIK, AMDGPUMCExpr::AGVK_Max, MF,
FRI.Callees, OutContext);
} else {
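// With an indirect call the callee set is incomplete, so take the max with
// the module-wide maximum symbol (assigned in finalize()) instead of the
// direct callee symbols.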
const MCExpr *SymRef = MCSymbolRefExpr::create(MaxSym, OutContext);
MCSymbol *LocalNumSym = getSymbol(MF.getName(), RIK, OutContext);
const MCExpr *MaxWithLocal = AMDGPUMCExpr::createMax(
{MCConstantExpr::create(numRegs, OutContext), SymRef}, OutContext);
LocalNumSym->setVariableValue(MaxWithLocal);
}
};
SetMaxReg(MaxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
SetMaxReg(MaxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
SetMaxReg(MaxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
{
// The expression for private segment size should be: FRI.PrivateSegmentSize
// + max(FRI.Callees, FRI.CalleeSegmentSize)
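// e.g.: .set foo.private_seg_size, 16+max(bar.private_seg_size, baz.private_seg_size)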
SmallVector<const MCExpr *, 8> ArgExprs;
if (FRI.CalleeSegmentSize)
ArgExprs.push_back(
MCConstantExpr::create(FRI.CalleeSegmentSize, OutContext));
if (!FRI.HasIndirectCall) {
for (const Function *Callee : FRI.Callees) {
MCSymbol *calleeValSym =
getSymbol(Callee->getName(), RIK_PrivateSegSize, OutContext);
ArgExprs.push_back(MCSymbolRefExpr::create(calleeValSym, OutContext));
}
}
const MCExpr *localConstExpr =
MCConstantExpr::create(FRI.PrivateSegmentSize, OutContext);
if (!ArgExprs.empty()) {
const AMDGPUMCExpr *transitiveExpr =
AMDGPUMCExpr::createMax(ArgExprs, OutContext);
localConstExpr =
MCBinaryExpr::createAdd(localConstExpr, transitiveExpr, OutContext);
}
getSymbol(MF.getName(), RIK_PrivateSegSize, OutContext)
->setVariableValue(localConstExpr);
}
auto SetToLocal = [&](int64_t LocalValue, ResourceInfoKind RIK) {
MCSymbol *Sym = getSymbol(MF.getName(), RIK, OutContext);
Sym->setVariableValue(MCConstantExpr::create(LocalValue, OutContext));
};
if (!FRI.HasIndirectCall) {
assignResourceInfoExpr(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC,
AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
assignResourceInfoExpr(FRI.UsesFlatScratch,
ResourceInfoKind::RIK_UsesFlatScratch,
AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
assignResourceInfoExpr(FRI.HasDynamicallySizedStack,
ResourceInfoKind::RIK_HasDynSizedStack,
AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
assignResourceInfoExpr(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion,
AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
assignResourceInfoExpr(FRI.HasIndirectCall,
ResourceInfoKind::RIK_HasIndirectCall,
AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
} else {
SetToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC);
SetToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch);
SetToLocal(FRI.HasDynamicallySizedStack,
ResourceInfoKind::RIK_HasDynSizedStack);
SetToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion);
SetToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall);
}
}
const MCExpr *MCResourceInfo::createTotalNumVGPRs(const MachineFunction &MF,
MCContext &Ctx) {
return AMDGPUMCExpr::createTotalNumVGPR(
getSymRefExpr(MF.getName(), RIK_NumAGPR, Ctx),
getSymRefExpr(MF.getName(), RIK_NumVGPR, Ctx), Ctx);
}
const MCExpr *MCResourceInfo::createTotalNumSGPRs(const MachineFunction &MF,
bool hasXnack,
MCContext &Ctx) {
return MCBinaryExpr::createAdd(
getSymRefExpr(MF.getName(), RIK_NumSGPR, Ctx),
AMDGPUMCExpr::createExtraSGPRs(
getSymRefExpr(MF.getName(), RIK_UsesVCC, Ctx),
getSymRefExpr(MF.getName(), RIK_UsesFlatScratch, Ctx), hasXnack, Ctx),
Ctx);
}


@ -0,0 +1,102 @@
//===- AMDGPUMCResourceInfo.h ----- MC Resource Info --------------*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief MC infrastructure to propagate the function level resource usage
/// info.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H
#include "AMDGPUResourceUsageAnalysis.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
namespace llvm {
class MCContext;
class MCSymbol;
class StringRef;
class MachineFunction;
class MCResourceInfo {
public:
enum ResourceInfoKind {
RIK_NumVGPR,
RIK_NumAGPR,
RIK_NumSGPR,
RIK_PrivateSegSize,
RIK_UsesVCC,
RIK_UsesFlatScratch,
RIK_HasDynSizedStack,
RIK_HasRecursion,
RIK_HasIndirectCall
};
private:
int32_t MaxVGPR = 0;
int32_t MaxAGPR = 0;
int32_t MaxSGPR = 0;
// Whether finalize(MCContext &) has been called. finalize should only be
// called once, at the end of asm printing, to assign the MaxXGPR symbols
// their final values.
bool Finalized = false;
void assignResourceInfoExpr(int64_t localValue, ResourceInfoKind RIK,
AMDGPUMCExpr::VariantKind Kind,
const MachineFunction &MF,
const SmallVectorImpl<const Function *> &Callees,
MCContext &OutContext);
// Assigns expression for Max S/V/A-GPRs to the referenced symbols.
void assignMaxRegs(MCContext &OutContext);
public:
MCResourceInfo() = default;
void addMaxVGPRCandidate(int32_t candidate) {
MaxVGPR = std::max(MaxVGPR, candidate);
}
void addMaxAGPRCandidate(int32_t candidate) {
MaxAGPR = std::max(MaxAGPR, candidate);
}
void addMaxSGPRCandidate(int32_t candidate) {
MaxSGPR = std::max(MaxSGPR, candidate);
}
MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK,
MCContext &OutContext);
const MCExpr *getSymRefExpr(StringRef FuncName, ResourceInfoKind RIK,
MCContext &Ctx);
// Resolves the final symbols that require the inter-function resource info
// to be resolved.
void finalize(MCContext &OutContext);
MCSymbol *getMaxVGPRSymbol(MCContext &OutContext);
MCSymbol *getMaxAGPRSymbol(MCContext &OutContext);
MCSymbol *getMaxSGPRSymbol(MCContext &OutContext);
/// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function
/// granularity. However, some resource info has to be assigned the
/// call-transitive maximum or logical-or. For example, if A calls B and B's
/// VGPR usage exceeds A's, A should be assigned B's VGPR usage. Furthermore,
/// functions with indirect calls should be assigned the module-level maximum.
void gatherResourceInfo(
const MachineFunction &MF,
const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI,
MCContext &OutContext);
const MCExpr *createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx);
const MCExpr *createTotalNumSGPRs(const MachineFunction &MF, bool hasXnack,
MCContext &Ctx);
};
} // namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H


@ -13,14 +13,6 @@
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//
#include "AMDGPUResourceUsageAnalysis.h"
@ -28,8 +20,8 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
@ -78,92 +70,37 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
return false;
}
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
const GCNSubtarget &ST) const {
return NumExplicitSGPR +
IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
ST.getTargetID().isXnackOnOrAny());
}
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
const GCNSubtarget &ST) const {
return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), NumAGPR, NumVGPR);
}
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC)
return false;
MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
const TargetMachine &TM = TPC->getTM<TargetMachine>();
const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
bool HasIndirectCall = false;
CallGraph CG = CallGraph(M);
auto End = po_end(&CG);
// By default, for code object v5 and later, track only the minimum scratch
// size
uint32_t AssumedStackSizeForDynamicSizeObjects =
clAssumedStackSizeForDynamicSizeObjects;
uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
AMDGPU::AMDHSA_COV5 ||
STI.getTargetTriple().getOS() == Triple::AMDPAL) {
if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
AssumedStackSizeForDynamicSizeObjects = 0;
if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
AssumedStackSizeForExternalCall = 0;
}
for (auto IT = po_begin(&CG); IT != End; ++IT) {
Function *F = IT->getFunction();
if (!F || F->isDeclaration())
continue;
MachineFunction *MF = MMI.getMachineFunction(*F);
assert(MF && "function must have been generated already");
auto CI =
CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
SIFunctionResourceInfo &Info = CI.first->second;
assert(CI.second && "should only be called once per function");
Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
AssumedStackSizeForExternalCall);
HasIndirectCall |= Info.HasIndirectCall;
}
// It's possible we have unreachable functions in the module which weren't
// visited by the PO traversal. Make sure we have some resource counts to
// report.
for (const auto &IT : CG) {
const Function *F = IT.first;
if (!F || F->isDeclaration())
continue;
auto CI =
CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
if (!CI.second) // Skip already visited functions
continue;
SIFunctionResourceInfo &Info = CI.first->second;
MachineFunction *MF = MMI.getMachineFunction(*F);
assert(MF && "function must have been generated already");
Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
AssumedStackSizeForExternalCall);
HasIndirectCall |= Info.HasIndirectCall;
}
if (HasIndirectCall)
propagateIndirectCallRegisterUsage();
ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects,
AssumedStackSizeForExternalCall);
return false;
}
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
const MachineFunction &MF, const TargetMachine &TM,
uint32_t AssumedStackSizeForDynamicSizeObjects,
const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
uint32_t AssumedStackSizeForExternalCall) const {
SIFunctionResourceInfo Info;
@ -253,7 +190,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
int32_t MaxVGPR = -1;
int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
uint64_t CalleeFrameSize = 0;
Info.CalleeSegmentSize = 0;
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
@ -512,8 +449,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
TII->getNamedOperand(MI, AMDGPU::OpName::callee);
const Function *Callee = getCalleeFunction(*CalleeOp);
DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
CallGraphResourceInfo.end();
// Avoid crashing on undefined behavior with an illegal call to a
// kernel. If a callsite's calling convention doesn't match the
@ -522,9 +457,14 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
report_fatal_error("invalid call to entry function");
auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
return F == &MF.getFunction();
};
if (Callee && !isSameFunction(MF, Callee))
Info.Callees.push_back(Callee);
bool IsIndirect = !Callee || Callee->isDeclaration();
if (!IsIndirect)
I = CallGraphResourceInfo.find(Callee);
// FIXME: Call site could have norecurse on it
if (!Callee || !Callee->doesNotRecurse()) {
@ -539,15 +479,15 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
// directly call the tail called function. If a kernel directly
// calls a tail recursive function, we'll assume maximum stack size
// based on the regular call instruction.
CalleeFrameSize = std::max(
CalleeFrameSize,
Info.CalleeSegmentSize = std::max(
Info.CalleeSegmentSize,
static_cast<uint64_t>(AssumedStackSizeForExternalCall));
}
}
if (IsIndirect || I == CallGraphResourceInfo.end()) {
CalleeFrameSize =
std::max(CalleeFrameSize,
if (IsIndirect) {
Info.CalleeSegmentSize =
std::max(Info.CalleeSegmentSize,
static_cast<uint64_t>(AssumedStackSizeForExternalCall));
// Register usage of indirect calls gets handled later
@ -555,19 +495,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
Info.HasDynamicallySizedStack = true;
Info.HasIndirectCall = true;
} else {
// We force CodeGen to run in SCC order, so the callee's register
// usage etc. should be the cumulative usage of all callees.
MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
CalleeFrameSize =
std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
Info.UsesVCC |= I->second.UsesVCC;
Info.UsesFlatScratch |= I->second.UsesFlatScratch;
Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
Info.HasRecursion |= I->second.HasRecursion;
Info.HasIndirectCall |= I->second.HasIndirectCall;
}
}
}
@ -576,36 +503,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
Info.NumAGPR = MaxAGPR + 1;
Info.PrivateSegmentSize += CalleeFrameSize;
return Info;
}
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
// Collect the maximum number of registers from non-hardware-entrypoints.
// All these functions are potential targets for indirect calls.
int32_t NonKernelMaxSGPRs = 0;
int32_t NonKernelMaxVGPRs = 0;
int32_t NonKernelMaxAGPRs = 0;
for (const auto &I : CallGraphResourceInfo) {
if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
auto &Info = I.getSecond();
NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
}
}
// Add register usage for functions with indirect calls.
// For calls to unknown functions, we assume the maximum register usage of
// all non-hardware-entrypoints in the current module.
for (auto &I : CallGraphResourceInfo) {
auto &Info = I.getSecond();
if (Info.HasIndirectCall) {
Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
}
}
}


@ -15,8 +15,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
namespace llvm {
@ -24,10 +24,9 @@ class GCNSubtarget;
class MachineFunction;
class TargetMachine;
struct AMDGPUResourceUsageAnalysis : public ModulePass {
static char ID;
struct AMDGPUResourceUsageAnalysis : public MachineFunctionPass {
public:
static char ID;
// Track resource usage for callee functions.
struct SIFunctionResourceInfo {
// Track the number of explicitly used VGPRs. Special registers reserved at
@ -35,48 +34,33 @@ public:
int32_t NumVGPR = 0;
int32_t NumAGPR = 0;
int32_t NumExplicitSGPR = 0;
uint64_t CalleeSegmentSize = 0;
uint64_t PrivateSegmentSize = 0;
bool UsesVCC = false;
bool UsesFlatScratch = false;
bool HasDynamicallySizedStack = false;
bool HasRecursion = false;
bool HasIndirectCall = false;
int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
// Total number of VGPRs is actually a combination of AGPR and VGPR
// depending on architecture - and some alignment constraints
int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
SmallVector<const Function *, 16> Callees;
};
AMDGPUResourceUsageAnalysis() : ModulePass(ID) {}
AMDGPUResourceUsageAnalysis() : MachineFunctionPass(ID) {}
bool doInitialization(Module &M) override {
CallGraphResourceInfo.clear();
return ModulePass::doInitialization(M);
}
bool runOnMachineFunction(MachineFunction &MF) override;
bool runOnModule(Module &M) override;
const SIFunctionResourceInfo &getResourceInfo() const { return ResourceInfo; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineModuleInfoWrapperPass>();
AU.setPreservesAll();
}
const SIFunctionResourceInfo &getResourceInfo(const Function *F) const {
auto Info = CallGraphResourceInfo.find(F);
assert(Info != CallGraphResourceInfo.end() &&
"Failed to find resource info for function");
return Info->getSecond();
MachineFunctionPass::getAnalysisUsage(AU);
}
private:
SIFunctionResourceInfo
analyzeResourceUsage(const MachineFunction &MF, const TargetMachine &TM,
analyzeResourceUsage(const MachineFunction &MF,
uint32_t AssumedStackSizeForDynamicSizeObjects,
uint32_t AssumedStackSizeForExternalCall) const;
void propagateIndirectCallRegisterUsage();
DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
SIFunctionResourceInfo ResourceInfo;
};
} // namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H


@ -81,6 +81,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMCInstLower.cpp
AMDGPUMemoryUtils.cpp
AMDGPUIGroupLP.cpp
AMDGPUMCResourceInfo.cpp
AMDGPUMarkLastScratchLoad.cpp
AMDGPUMIRFormatter.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp


@ -271,6 +271,47 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
<< Alignment.value() << '\n';
}
void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
const MCSymbol *HasIndirectCall) {
#define PRINT_RES_INFO(ARG) \
OS << "\t.set "; \
ARG->print(OS, getContext().getAsmInfo()); \
OS << ", "; \
ARG->getVariableValue()->print(OS, getContext().getAsmInfo()); \
Streamer.addBlankLine();
PRINT_RES_INFO(NumVGPR);
PRINT_RES_INFO(NumAGPR);
PRINT_RES_INFO(NumExplicitSGPR);
PRINT_RES_INFO(PrivateSegmentSize);
PRINT_RES_INFO(UsesVCC);
PRINT_RES_INFO(UsesFlatScratch);
PRINT_RES_INFO(HasDynamicallySizedStack);
PRINT_RES_INFO(HasRecursion);
PRINT_RES_INFO(HasIndirectCall);
#undef PRINT_RES_INFO
}
void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
const MCSymbol *MaxAGPR,
const MCSymbol *MaxSGPR) {
#define PRINT_RES_INFO(ARG) \
OS << "\t.set "; \
ARG->print(OS, getContext().getAsmInfo()); \
OS << ", "; \
ARG->getVariableValue()->print(OS, getContext().getAsmInfo()); \
Streamer.addBlankLine();
PRINT_RES_INFO(MaxVGPR);
PRINT_RES_INFO(MaxAGPR);
PRINT_RES_INFO(MaxSGPR);
#undef PRINT_RES_INFO
}
bool AMDGPUTargetAsmStreamer::EmitISAVersion() {
OS << "\t.amd_amdgpu_isa \"" << getTargetID()->toString() << "\"\n";
return true;


@ -60,6 +60,17 @@ public:
virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, Align Alignment) {
}
virtual void EmitMCResourceInfo(
const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
const MCSymbol *HasIndirectCall) {};
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
const MCSymbol *MaxAGPR,
const MCSymbol *MaxSGPR) {};
/// \returns True on success, false on failure.
virtual bool EmitISAVersion() { return true; }
@ -136,6 +147,18 @@ public:
void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
const MCSymbol *NumExplicitSGPR,
const MCSymbol *PrivateSegmentSize,
const MCSymbol *UsesVCC,
const MCSymbol *UsesFlatScratch,
const MCSymbol *HasDynamicallySizedStack,
const MCSymbol *HasRecursion,
const MCSymbol *HasIndirectCall) override;
void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR,
const MCSymbol *MaxSGPR) override;
/// \returns True on success, false on failure.
bool EmitISAVersion() override;


@ -215,15 +215,15 @@ void AMDGPUPALMetadata::setRegister(unsigned Reg, const MCExpr *Val,
const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
}
ExprIt->getSecond() = Val;
} else if (N.getKind() == msgpack::Type::UInt) {
const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
int64_t Unused;
if (!Val->evaluateAsAbsolute(Unused))
REM[Reg] = Val;
(void)Unused;
} else {
// Default to uint64_t 0 so additional calls to setRegister will allow
// ORs to propagate.
N = (uint64_t)0;
}
REM[Reg] = Val;
DelayedExprs.assignDocNode(N, msgpack::Type::UInt, Val);
}


@ -154,25 +154,28 @@ bb:
declare void @undef_func()
; GCN-LABEL: {{^}}kernel_call_undef_func:
; GFX908: .amdhsa_next_free_vgpr 32
; GFX90A: .amdhsa_next_free_vgpr 64
; GFX90A: .amdhsa_accum_offset 32
; GCN: NumVgprs: 32
; GCN: NumAgprs: 32
; GFX908: TotalNumVgprs: 32
; GFX90A: TotalNumVgprs: 64
; GFX908: VGPRBlocks: 7
; GFX90A: VGPRBlocks: 7
; GFX908: NumVGPRsForWavesPerEU: 32
; GFX90A: NumVGPRsForWavesPerEU: 64
; GFX90A: AccumOffset: 32
; GFX908: Occupancy: 8
; GFX90A: Occupancy: 8
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
; GCN: .set kernel_call_undef_func.num_vgpr, max(32, amdgpu.max_num_vgpr)
; GCN: .set kernel_call_undef_func.num_agpr, max(0, amdgpu.max_num_agpr)
; GCN: NumVgprs: kernel_call_undef_func.num_vgpr
; GCN: NumAgprs: kernel_call_undef_func.num_agpr
; GCN: TotalNumVgprs: totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr)
; GFX908: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 4))/4)-1
; GFX90A: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 8))/8)-1
; GCN: NumVGPRsForWavesPerEU: max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
; GFX90A: AccumOffset: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)+1)*4
; GFX908: Occupancy: occupancy(10, 4, 256, 8, 10, max(kernel_call_undef_func.numbered_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
; GFX90A: Occupancy: occupancy(8, 8, 512, 8, 8, max(kernel_call_undef_func.numbered_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63
define amdgpu_kernel void @kernel_call_undef_func() #0 {
bb:
call void @undef_func()
ret void
}
; GCN: .set amdgpu.max_num_vgpr, 32
; GCN-NEXT: .set amdgpu.max_num_agpr, 32
; GCN-NEXT: .set amdgpu.max_num_sgpr, 34
attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }


@ -11,7 +11,7 @@
; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
; ASM: .amdhsa_user_sgpr_count 10
; ASM: .amdhsa_next_free_sgpr 10
; ASM: ; NumSgprs: 16
; ASM: ; TotalNumSgprs: 16
; ASM: ; NumSGPRsForWavesPerEU: 16
; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
@ -31,7 +31,7 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret
; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
; ASM: .amdhsa_user_sgpr_count 10
; ASM: .amdhsa_next_free_sgpr 10
; ASM: ; NumSgprs: 16
; ASM: ; TotalNumSgprs: 16
; ASM: ; NumSGPRsForWavesPerEU: 16
; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
@ -47,7 +47,7 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 {
; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
; ASM: .amdhsa_user_sgpr_count 3
; ASM: .amdhsa_next_free_sgpr 3
; ASM: ; NumSgprs: 9
; ASM: ; TotalNumSgprs: 9
; ASM: ; NumSGPRsForWavesPerEU: 9
; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
@ -62,7 +62,7 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r
; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
; ASM: .amdhsa_user_sgpr_count 2
; ASM: .amdhsa_next_free_sgpr 0
; ASM: ; NumSgprs: 6
; ASM: ; TotalNumSgprs: 6
; ASM: ; NumSGPRsForWavesPerEU: 6
; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.


@ -60,7 +60,9 @@ bb:
declare void @undef_func()
; CHECK: .type kernel_call_undef_func
; CHECK: NumAgprs: 32
; CHECK: .set kernel_call_undef_func.num_agpr, max(0, amdgpu.max_num_agpr)
; CHECK: NumAgprs: kernel_call_undef_func.num_agpr
; CHECK: .set amdgpu.max_num_agpr, 32
define amdgpu_kernel void @kernel_call_undef_func() #0 {
bb:
call void @undef_func()


@ -547,18 +547,20 @@ define amdgpu_kernel void @f256() #256 {
attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
; GCN-LABEL: {{^}}f512:
; GFX9: NumVgprs: 128
; GFX90A: NumVgprs: 128
; GFX90A: NumAgprs: 128
; GFX90A: TotalNumVgprs: 256
; GFX10WGP-WAVE32: NumVgprs: 256
; GFX10WGP-WAVE64: NumVgprs: 256
; GFX10CU-WAVE32: NumVgprs: 128
; GFX10CU-WAVE64: NumVgprs: 128
; GFX11WGP-WAVE32: NumVgprs: 256
; GFX11WGP-WAVE64: NumVgprs: 256
; GFX11CU-WAVE32: NumVgprs: 192
; GFX11CU-WAVE64: NumVgprs: 192
; GFX9: .set f512.num_vgpr, max(128, amdgpu.max_num_vgpr)
; GFX90A: .set f512.num_vgpr, max(128, amdgpu.max_num_vgpr)
; GFX90A: .set f512.num_agpr, max(128, amdgpu.max_num_agpr)
; GFX10WGP-WAVE32: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr)
; GFX10WGP-WAVE64: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr)
; GFX10CU-WAVE32: .set f512.num_vgpr, max(128, amdgpu.max_num_vgpr)
; GFX10CU-WAVE64: .set f512.num_vgpr, max(128, amdgpu.max_num_vgpr)
; GFX11WGP-WAVE32: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr)
; GFX11WGP-WAVE64: .set f512.num_vgpr, max(256, amdgpu.max_num_vgpr)
; GFX11CU-WAVE32: .set f512.num_vgpr, max(192, amdgpu.max_num_vgpr)
; GFX11CU-WAVE64: .set f512.num_vgpr, max(192, amdgpu.max_num_vgpr)
; GCN: NumVgprs: f512.num_vgpr
; GFX90A: NumAgprs: f512.num_agpr
; GFX90A: TotalNumVgprs: totalnumvgprs(f512.num_agpr, f512.num_vgpr)
define amdgpu_kernel void @f512() #512 {
call void @foo()
call void @use256vgprs()
@ -567,17 +569,20 @@ define amdgpu_kernel void @f512() #512 {
attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
; GCN-LABEL: {{^}}f1024:
; GFX9: NumVgprs: 64
; GFX90A: NumAgprs: 64
; GFX90A: TotalNumVgprs: 128
; GFX10WGP-WAVE32: NumVgprs: 128
; GFX10WGP-WAVE64: NumVgprs: 128
; GFX10CU-WAVE32: NumVgprs: 64
; GFX10CU-WAVE64: NumVgprs: 64
; GFX11WGP-WAVE32: NumVgprs: 192
; GFX11WGP-WAVE64: NumVgprs: 192
; GFX11CU-WAVE32: NumVgprs: 96
; GFX11CU-WAVE64: NumVgprs: 96
; GFX9: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr)
; GFX90A: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr)
; GFX90A: .set f1024.num_agpr, max(64, amdgpu.max_num_agpr)
; GFX10WGP-WAVE32: .set f1024.num_vgpr, max(128, amdgpu.max_num_vgpr)
; GFX10WGP-WAVE64: .set f1024.num_vgpr, max(128, amdgpu.max_num_vgpr)
; GFX10CU-WAVE32: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr)
; GFX10CU-WAVE64: .set f1024.num_vgpr, max(64, amdgpu.max_num_vgpr)
; GFX11WGP-WAVE32: .set f1024.num_vgpr, max(192, amdgpu.max_num_vgpr)
; GFX11WGP-WAVE64: .set f1024.num_vgpr, max(192, amdgpu.max_num_vgpr)
; GFX11CU-WAVE32: .set f1024.num_vgpr, max(96, amdgpu.max_num_vgpr)
; GFX11CU-WAVE64: .set f1024.num_vgpr, max(96, amdgpu.max_num_vgpr)
; GCN: NumVgprs: f1024.num_vgpr
; GFX90A: NumAgprs: f1024.num_agpr
; GFX90A: TotalNumVgprs: totalnumvgprs(f1024.num_agpr, f1024.num_vgpr)
define amdgpu_kernel void @f1024() #1024 {
call void @foo()
call void @use256vgprs()


@ -1,4 +1,4 @@
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=ALL,GFX908 %s
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=ALL %s
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL,GFX90A %s
; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
@ -8,12 +8,13 @@
@alias = hidden alias void (), ptr @aliasee_default
; ALL-LABEL: {{^}}kernel:
; GFX908: .amdhsa_next_free_vgpr 32
; GFX908-NEXT: .amdhsa_next_free_sgpr 33
; ALL: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel.num_agpr, kernel.num_vgpr), 1, 0)
; ALL-NEXT: .amdhsa_next_free_sgpr (max(kernel.numbered_sgpr+(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1))
; GFX90A-NEXT: .amdhsa_accum_offset ((((((alignto(max(1, kernel.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
; GFX90A: .amdhsa_next_free_vgpr 59
; GFX90A-NEXT: .amdhsa_next_free_sgpr 33
; GFX90A-NEXT: .amdhsa_accum_offset 32
; ALL: .set kernel.num_vgpr, max(32, aliasee_default.num_vgpr)
; ALL-NEXT: .set kernel.num_agpr, max(0, aliasee_default.num_agpr)
; ALL-NEXT: .set kernel.numbered_sgpr, max(33, aliasee_default.numbered_sgpr)
define amdgpu_kernel void @kernel() #0 {
bb:
call void @alias() #2
@ -25,6 +26,9 @@ bb:
call void asm sideeffect "; clobber a26 ", "~{a26}"()
ret void
}
; ALL: .set aliasee_default.num_vgpr, 0
; ALL-NEXT: .set aliasee_default.num_agpr, 27
; ALL-NEXT: .set aliasee_default.numbered_sgpr, 32
attributes #0 = { noinline norecurse nounwind optnone }
attributes #1 = { noinline norecurse nounwind readnone willreturn }


@ -7,14 +7,18 @@
@alias0 = hidden alias void (), ptr @aliasee_default_vgpr64_sgpr102
; CHECK-LABEL: {{^}}kernel0:
; CHECK: .amdhsa_next_free_vgpr 53
; CHECK-NEXT: .amdhsa_next_free_sgpr 33
; CHECK: .set kernel0.num_vgpr, max(32, aliasee_default_vgpr64_sgpr102.num_vgpr)
; CHECK-NEXT: .set kernel0.num_agpr, max(0, aliasee_default_vgpr64_sgpr102.num_agpr)
; CHECK-NEXT: .set kernel0.numbered_sgpr, max(33, aliasee_default_vgpr64_sgpr102.numbered_sgpr)
define amdgpu_kernel void @kernel0() #0 {
bb:
call void @alias0() #2
ret void
}
; CHECK: .set aliasee_default_vgpr64_sgpr102.num_vgpr, 53
; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.num_agpr, 0
; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.numbered_sgpr, 32
define internal void @aliasee_default_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()


@ -9,8 +9,12 @@
; The parent kernel has a higher VGPR usage than the possible callees.
; CHECK-LABEL: {{^}}kernel1:
; CHECK: .amdhsa_next_free_vgpr 41
; CHECK-NEXT: .amdhsa_next_free_sgpr 33
; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel1.num_agpr, kernel1.num_vgpr), 1, 0)
; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel1.numbered_sgpr+(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1))
; CHECK: .set kernel1.num_vgpr, max(41, aliasee_vgpr32_sgpr76.num_vgpr)
; CHECK-NEXT: .set kernel1.num_agpr, max(0, aliasee_vgpr32_sgpr76.num_agpr)
; CHECK-NEXT: .set kernel1.numbered_sgpr, max(33, aliasee_vgpr32_sgpr76.numbered_sgpr)
define amdgpu_kernel void @kernel1() #0 {
bb:
call void asm sideeffect "; clobber v40 ", "~{v40}"()
@ -18,6 +22,9 @@ bb:
ret void
}
; CHECK: .set aliasee_vgpr32_sgpr76.num_vgpr, 27
; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.num_agpr, 0
; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.numbered_sgpr, 32
define internal void @aliasee_vgpr32_sgpr76() #1 {
bb:
call void asm sideeffect "; clobber v26 ", "~{v26}"()


@ -7,14 +7,21 @@
@alias2 = hidden alias void (), ptr @aliasee_vgpr64_sgpr102
; CHECK-LABEL: {{^}}kernel2:
; CHECK: .amdhsa_next_free_vgpr 53
; CHECK-NEXT: .amdhsa_next_free_sgpr 33
; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel2.num_agpr, kernel2.num_vgpr), 1, 0)
; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel2.numbered_sgpr+(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1))
; CHECK: .set kernel2.num_vgpr, max(32, aliasee_vgpr64_sgpr102.num_vgpr)
; CHECK-NEXT: .set kernel2.num_agpr, max(0, aliasee_vgpr64_sgpr102.num_agpr)
; CHECK-NEXT: .set kernel2.numbered_sgpr, max(33, aliasee_vgpr64_sgpr102.numbered_sgpr)
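; e.g., max(32, aliasee_vgpr64_sgpr102.num_vgpr = 53) resolves to 53, matching the old
; concrete .amdhsa_next_free_vgpr 53 check.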
define amdgpu_kernel void @kernel2() #0 {
bb:
call void @alias2() #2
ret void
}
; CHECK: .set aliasee_vgpr64_sgpr102.num_vgpr, 53
; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.num_agpr, 0
; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.numbered_sgpr, 32
define internal void @aliasee_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()


@ -7,14 +7,21 @@
@alias3 = hidden alias void (), ptr @aliasee_vgpr256_sgpr102
; CHECK-LABEL: {{^}}kernel3:
; CHECK: .amdhsa_next_free_vgpr 253
; CHECK-NEXT: .amdhsa_next_free_sgpr 33
; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel3.num_agpr, kernel3.num_vgpr), 1, 0)
; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel3.numbered_sgpr+(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1))
; CHECK: .set kernel3.num_vgpr, max(32, aliasee_vgpr256_sgpr102.num_vgpr)
; CHECK-NEXT: .set kernel3.num_agpr, max(0, aliasee_vgpr256_sgpr102.num_agpr)
; CHECK-NEXT: .set kernel3.numbered_sgpr, max(33, aliasee_vgpr256_sgpr102.numbered_sgpr)
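; e.g., max(32, aliasee_vgpr256_sgpr102.num_vgpr = 253) resolves to 253, matching the old
; concrete .amdhsa_next_free_vgpr 253 check.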
define amdgpu_kernel void @kernel3() #0 {
bb:
call void @alias3() #2
ret void
}
; CHECK: .set aliasee_vgpr256_sgpr102.num_vgpr, 253
; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.num_agpr, 0
; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.numbered_sgpr, 33
define internal void @aliasee_vgpr256_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v252 ", "~{v252}"()


@ -7,7 +7,7 @@
; Make sure to run on a GPU with the SGPR allocation bug.
; GCN-LABEL: {{^}}use_vcc:
; GCN: ; NumSgprs: 34
; GCN: ; TotalNumSgprs: 34
; GCN: ; NumVgprs: 0
define void @use_vcc() #1 {
call void asm sideeffect "", "~{vcc}" () #0
@ -25,7 +25,7 @@ define void @use_vcc() #1 {
; GCN: v_readlane_b32 s4, v40, 2
; GCN: s_mov_b32 s33, s4
; GCN: s_setpc_b64 s[30:31]
; GCN: ; NumSgprs: 36
; GCN: ; TotalNumSgprs: 36
; GCN: ; NumVgprs: 41
define void @indirect_use_vcc() #1 {
call void @use_vcc()
@ -33,9 +33,9 @@ define void @indirect_use_vcc() #1 {
}
; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
; CI: ; NumSgprs: 38
; VI-NOBUG: ; NumSgprs: 40
; VI-BUG: ; NumSgprs: 96
; CI: ; TotalNumSgprs: 38
; VI-NOBUG: ; TotalNumSgprs: 40
; VI-BUG: ; TotalNumSgprs: 96
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
call void @indirect_use_vcc()
@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
}
; GCN-LABEL: {{^}}use_flat_scratch:
; CI: ; NumSgprs: 36
; VI: ; NumSgprs: 38
; CI: ; TotalNumSgprs: 36
; VI: ; TotalNumSgprs: 38
; GCN: ; NumVgprs: 0
define void @use_flat_scratch() #1 {
call void asm sideeffect "", "~{flat_scratch}" () #0
@ -52,8 +52,8 @@ define void @use_flat_scratch() #1 {
}
; GCN-LABEL: {{^}}indirect_use_flat_scratch:
; CI: ; NumSgprs: 38
; VI: ; NumSgprs: 40
; CI: ; TotalNumSgprs: 38
; VI: ; TotalNumSgprs: 40
; GCN: ; NumVgprs: 41
define void @indirect_use_flat_scratch() #1 {
call void @use_flat_scratch()
@ -61,9 +61,9 @@ define void @indirect_use_flat_scratch() #1 {
}
; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
; CI: ; NumSgprs: 38
; VI-NOBUG: ; NumSgprs: 40
; VI-BUG: ; NumSgprs: 96
; CI: ; TotalNumSgprs: 38
; VI-NOBUG: ; TotalNumSgprs: 40
; VI-BUG: ; TotalNumSgprs: 96
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 {
call void @indirect_use_flat_scratch()
@ -107,23 +107,23 @@ define void @indirect_use_50_vgpr() #0 {
}
; GCN-LABEL: {{^}}use_80_sgpr:
; GCN: ; NumSgprs: 80
; GCN: ; TotalNumSgprs: 80
define void @use_80_sgpr() #1 {
call void asm sideeffect "", "~{s79}"() #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_80_sgpr:
; GCN: ; NumSgprs: 82
; GCN: ; TotalNumSgprs: 82
define void @indirect_use_80_sgpr() #1 {
call void @use_80_sgpr()
ret void
}
; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
; CI: ; NumSgprs: 84
; VI-NOBUG: ; NumSgprs: 86
; VI-BUG: ; NumSgprs: 96
; CI: ; TotalNumSgprs: 84
; VI-NOBUG: ; TotalNumSgprs: 86
; VI-BUG: ; TotalNumSgprs: 96
define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
call void @indirect_use_80_sgpr()
ret void
@ -176,7 +176,7 @@ define amdgpu_kernel void @multi_call_use_use_stack() #0 {
declare void @external() #0
; GCN-LABEL: {{^}}usage_external:
; NumSgprs: 48
; TotalNumSgprs: 48
; NumVgprs: 24
; GCN: ScratchSize: 16384
;
@ -190,7 +190,7 @@ define amdgpu_kernel void @usage_external() #0 {
declare void @external_recurse() #2
; GCN-LABEL: {{^}}usage_external_recurse:
; NumSgprs: 48
; TotalNumSgprs: 48
; NumVgprs: 24
; GCN: ScratchSize: 16384
;
@ -234,10 +234,11 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
; Make sure there's no assert when an sgpr96 is used.
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
; CI: NumSgprs: 84
; VI-NOBUG: NumSgprs: 86
; VI-BUG: NumSgprs: 96
; GCN: NumVgprs: 50
; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4
; VI-BUG: TotalNumSgprs: 96
; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr
define amdgpu_kernel void @count_use_sgpr96_external_call() {
entry:
tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
@ -248,10 +249,11 @@ entry:
; Make sure there's no assert when an sgpr160 is used.
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
; CI: NumSgprs: 84
; VI-NOBUG: NumSgprs: 86
; VI-BUG: NumSgprs: 96
; GCN: NumVgprs: 50
; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4
; VI-BUG: TotalNumSgprs: 96
; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr
define amdgpu_kernel void @count_use_sgpr160_external_call() {
entry:
tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@ -262,10 +264,11 @@ entry:
; Make sure there's no assert when a vgpr160 is used.
; GCN-LABEL: {{^}}count_use_vgpr160_external_call
; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
; CI: NumSgprs: 84
; VI-NOBUG: NumSgprs: 86
; VI-BUG: NumSgprs: 96
; GCN: NumVgprs: 50
; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(5, amdgpu.max_num_vgpr)
; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; CI: TotalNumSgprs: count_use_vgpr160_external_call.numbered_sgpr+4
; VI-BUG: TotalNumSgprs: 96
; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr
define amdgpu_kernel void @count_use_vgpr160_external_call() {
entry:
tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@ -273,6 +276,27 @@ entry:
ret void
}
; GCN: .set amdgpu.max_num_vgpr, 50
; GCN: .set amdgpu.max_num_agpr, 0
; GCN: .set amdgpu.max_num_sgpr, 80
; GCN-LABEL: amdhsa.kernels:
; GCN: .name: count_use_sgpr96_external_call
; CI: .sgpr_count: 84
; VI-NOBUG: .sgpr_count: 86
; VI-BUG: .sgpr_count: 96
; GCN: .vgpr_count: 50
; GCN: .name: count_use_sgpr160_external_call
; CI: .sgpr_count: 84
; VI-NOBUG: .sgpr_count: 86
; VI-BUG: .sgpr_count: 96
; GCN: .vgpr_count: 50
; GCN: .name: count_use_vgpr160_external_call
; CI: .sgpr_count: 84
; VI-NOBUG: .sgpr_count: 86
; VI-BUG: .sgpr_count: 96
; GCN: .vgpr_count: 50
attributes #0 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #1 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #2 = { nounwind noinline }


@ -33,6 +33,7 @@ bb2:
; GCN-LABEL: {{^}}preserve_condition_undef_flag:
; GCN-NOT: vcc
; GCN: s_endpgm
define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
bb0:
%tmp = icmp sgt i32 %arg1, 4


@ -1,8 +1,8 @@
; REQUIRES: asserts
; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
; RUN: not --crash llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
; CHECK: function must have been generated already
; CHECK-NOT: func
define internal i32 @func() {
ret i32 0


@ -25,11 +25,11 @@
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; NumSgprs: 8
; VI-NOXNACK: ; NumSgprs: 8
; VI-XNACK: ; NumSgprs: 12
; GFX9-ARCH-FLAT: ; NumSgprs: 14
; GFX10-ARCH-FLAT: ; NumSgprs: 8
; CI: ; TotalNumSgprs: 8
; VI-NOXNACK: ; TotalNumSgprs: 8
; VI-XNACK: ; TotalNumSgprs: 12
; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
define amdgpu_kernel void @no_vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7}"()
@ -42,11 +42,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; NumSgprs: 10
; VI-NOXNACK: ; NumSgprs: 10
; VI-XNACK: ; NumSgprs: 12
; GFX9-ARCH-FLAT: ; NumSgprs: 14
; GFX10-ARCH-FLAT: ; NumSgprs: 10
; CI: ; TotalNumSgprs: 10
; VI-NOXNACK: ; TotalNumSgprs: 10
; VI-XNACK: ; TotalNumSgprs: 12
; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
define amdgpu_kernel void @vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc}"()
@ -59,11 +59,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; NumSgprs: 12
; VI-NOXNACK: ; NumSgprs: 14
; VI-XNACK: ; NumSgprs: 14
; GFX9-ARCH-FLAT: ; NumSgprs: 14
; GFX10-ARCH-FLAT: ; NumSgprs: 8
; CI: ; TotalNumSgprs: 12
; VI-NOXNACK: ; TotalNumSgprs: 14
; VI-XNACK: ; TotalNumSgprs: 14
; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
define amdgpu_kernel void @no_vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@ -76,11 +76,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; NumSgprs: 12
; VI-NOXNACK: ; NumSgprs: 14
; VI-XNACK: ; NumSgprs: 14
; GFX9-ARCH-FLAT: ; NumSgprs: 14
; GFX10-ARCH-FLAT: ; NumSgprs: 10
; CI: ; TotalNumSgprs: 12
; VI-NOXNACK: ; TotalNumSgprs: 14
; VI-XNACK: ; TotalNumSgprs: 14
; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
define amdgpu_kernel void @vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@ -96,11 +96,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
; VI-XNACK: NumSgprs: 6
; GFX9-ARCH-FLAT: ; NumSgprs: 6
; GFX10-ARCH-FLAT: ; NumSgprs: 0
; CI: TotalNumSgprs: 4
; VI-NOXNACK: TotalNumSgprs: 6
; VI-XNACK: TotalNumSgprs: 6
; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@ -113,11 +113,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
; VI-XNACK: NumSgprs: 6
; GFX9-ARCH-FLAT: ; NumSgprs: 6
; GFX10-ARCH-FLAT: ; NumSgprs: 0
; CI: TotalNumSgprs: 4
; VI-NOXNACK: TotalNumSgprs: 6
; VI-XNACK: TotalNumSgprs: 6
; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr_lo() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@ -130,11 +130,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
; VI-XNACK: NumSgprs: 6
; GFX9-ARCH-FLAT: ; NumSgprs: 6
; GFX10-ARCH-FLAT: ; NumSgprs: 0
; CI: TotalNumSgprs: 4
; VI-NOXNACK: TotalNumSgprs: 6
; VI-XNACK: TotalNumSgprs: 6
; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr_hi() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()


@ -0,0 +1,531 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefix=GCN %s
; Functions that don't make calls should have constants as their resource usage, since no resource information has to be propagated.
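; For example, use_vcc below gets literal .set values, while its caller indirect_use_vcc
; wraps those literals in max()/or() expressions.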
; GCN-LABEL: {{^}}use_vcc:
; GCN: .set use_vcc.num_vgpr, 0
; GCN: .set use_vcc.num_agpr, 0
; GCN: .set use_vcc.numbered_sgpr, 32
; GCN: .set use_vcc.private_seg_size, 0
; GCN: .set use_vcc.uses_vcc, 1
; GCN: .set use_vcc.uses_flat_scratch, 0
; GCN: .set use_vcc.has_dyn_sized_stack, 0
; GCN: .set use_vcc.has_recursion, 0
; GCN: .set use_vcc.has_indirect_call, 0
; GCN: TotalNumSgprs: 36
; GCN: NumVgprs: 0
; GCN: ScratchSize: 0
define void @use_vcc() #1 {
call void asm sideeffect "", "~{vcc}" () #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_vcc:
; GCN: .set indirect_use_vcc.num_vgpr, max(41, use_vcc.num_vgpr)
; GCN: .set indirect_use_vcc.num_agpr, max(0, use_vcc.num_agpr)
; GCN: .set indirect_use_vcc.numbered_sgpr, max(34, use_vcc.numbered_sgpr)
; GCN: .set indirect_use_vcc.private_seg_size, 16+(max(use_vcc.private_seg_size))
; GCN: .set indirect_use_vcc.uses_vcc, or(1, use_vcc.uses_vcc)
; GCN: .set indirect_use_vcc.uses_flat_scratch, or(0, use_vcc.uses_flat_scratch)
; GCN: .set indirect_use_vcc.has_dyn_sized_stack, or(0, use_vcc.has_dyn_sized_stack)
; GCN: .set indirect_use_vcc.has_recursion, or(0, use_vcc.has_recursion)
; GCN: .set indirect_use_vcc.has_indirect_call, or(0, use_vcc.has_indirect_call)
; GCN: TotalNumSgprs: 38
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define void @indirect_use_vcc() #1 {
call void @use_vcc()
ret void
}
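; e.g., substituting use_vcc's literals from above: num_vgpr = max(41, 0) = 41 and
; private_seg_size = 16+0 = 16, matching the NumVgprs and ScratchSize checks.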
; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
; GCN: .set indirect_2level_use_vcc_kernel.num_vgpr, max(32, indirect_use_vcc.num_vgpr)
; GCN: .set indirect_2level_use_vcc_kernel.num_agpr, max(0, indirect_use_vcc.num_agpr)
; GCN: .set indirect_2level_use_vcc_kernel.numbered_sgpr, max(33, indirect_use_vcc.numbered_sgpr)
; GCN: .set indirect_2level_use_vcc_kernel.private_seg_size, 0+(max(indirect_use_vcc.private_seg_size))
; GCN: .set indirect_2level_use_vcc_kernel.uses_vcc, or(1, indirect_use_vcc.uses_vcc)
; GCN: .set indirect_2level_use_vcc_kernel.uses_flat_scratch, or(1, indirect_use_vcc.uses_flat_scratch)
; GCN: .set indirect_2level_use_vcc_kernel.has_dyn_sized_stack, or(0, indirect_use_vcc.has_dyn_sized_stack)
; GCN: .set indirect_2level_use_vcc_kernel.has_recursion, or(0, indirect_use_vcc.has_recursion)
; GCN: .set indirect_2level_use_vcc_kernel.has_indirect_call, or(0, indirect_use_vcc.has_indirect_call)
; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
call void @indirect_use_vcc()
ret void
}
; GCN-LABEL: {{^}}use_flat_scratch:
; GCN: .set use_flat_scratch.num_vgpr, 0
; GCN: .set use_flat_scratch.num_agpr, 0
; GCN: .set use_flat_scratch.numbered_sgpr, 32
; GCN: .set use_flat_scratch.private_seg_size, 0
; GCN: .set use_flat_scratch.uses_vcc, 0
; GCN: .set use_flat_scratch.uses_flat_scratch, 1
; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
; GCN: .set use_flat_scratch.has_recursion, 0
; GCN: .set use_flat_scratch.has_indirect_call, 0
; GCN: TotalNumSgprs: 38
; GCN: NumVgprs: 0
; GCN: ScratchSize: 0
define void @use_flat_scratch() #1 {
call void asm sideeffect "", "~{flat_scratch}" () #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_flat_scratch:
; GCN: .set indirect_use_flat_scratch.num_vgpr, max(41, use_flat_scratch.num_vgpr)
; GCN: .set indirect_use_flat_scratch.num_agpr, max(0, use_flat_scratch.num_agpr)
; GCN: .set indirect_use_flat_scratch.numbered_sgpr, max(34, use_flat_scratch.numbered_sgpr)
; GCN: .set indirect_use_flat_scratch.private_seg_size, 16+(max(use_flat_scratch.private_seg_size))
; GCN: .set indirect_use_flat_scratch.uses_vcc, or(1, use_flat_scratch.uses_vcc)
; GCN: .set indirect_use_flat_scratch.uses_flat_scratch, or(0, use_flat_scratch.uses_flat_scratch)
; GCN: .set indirect_use_flat_scratch.has_dyn_sized_stack, or(0, use_flat_scratch.has_dyn_sized_stack)
; GCN: .set indirect_use_flat_scratch.has_recursion, or(0, use_flat_scratch.has_recursion)
; GCN: .set indirect_use_flat_scratch.has_indirect_call, or(0, use_flat_scratch.has_indirect_call)
; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define void @indirect_use_flat_scratch() #1 {
call void @use_flat_scratch()
ret void
}
; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
; GCN: .set indirect_2level_use_flat_scratch_kernel.num_vgpr, max(32, indirect_use_flat_scratch.num_vgpr)
; GCN: .set indirect_2level_use_flat_scratch_kernel.num_agpr, max(0, indirect_use_flat_scratch.num_agpr)
; GCN: .set indirect_2level_use_flat_scratch_kernel.numbered_sgpr, max(33, indirect_use_flat_scratch.numbered_sgpr)
; GCN: .set indirect_2level_use_flat_scratch_kernel.private_seg_size, 0+(max(indirect_use_flat_scratch.private_seg_size))
; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_vcc, or(1, indirect_use_flat_scratch.uses_vcc)
; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_flat_scratch, or(1, indirect_use_flat_scratch.uses_flat_scratch)
; GCN: .set indirect_2level_use_flat_scratch_kernel.has_dyn_sized_stack, or(0, indirect_use_flat_scratch.has_dyn_sized_stack)
; GCN: .set indirect_2level_use_flat_scratch_kernel.has_recursion, or(0, indirect_use_flat_scratch.has_recursion)
; GCN: .set indirect_2level_use_flat_scratch_kernel.has_indirect_call, or(0, indirect_use_flat_scratch.has_indirect_call)
; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 {
call void @indirect_use_flat_scratch()
ret void
}
; GCN-LABEL: {{^}}use_10_vgpr:
; GCN: .set use_10_vgpr.num_vgpr, 10
; GCN: .set use_10_vgpr.num_agpr, 0
; GCN: .set use_10_vgpr.numbered_sgpr, 32
; GCN: .set use_10_vgpr.private_seg_size, 0
; GCN: .set use_10_vgpr.uses_vcc, 0
; GCN: .set use_10_vgpr.uses_flat_scratch, 0
; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
; GCN: .set use_10_vgpr.has_recursion, 0
; GCN: .set use_10_vgpr.has_indirect_call, 0
; GCN: TotalNumSgprs: 36
; GCN: NumVgprs: 10
; GCN: ScratchSize: 0
define void @use_10_vgpr() #1 {
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4}"() #0
call void asm sideeffect "", "~{v5},~{v6},~{v7},~{v8},~{v9}"() #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_10_vgpr:
; GCN: .set indirect_use_10_vgpr.num_vgpr, max(41, use_10_vgpr.num_vgpr)
; GCN: .set indirect_use_10_vgpr.num_agpr, max(0, use_10_vgpr.num_agpr)
; GCN: .set indirect_use_10_vgpr.numbered_sgpr, max(34, use_10_vgpr.numbered_sgpr)
; GCN: .set indirect_use_10_vgpr.private_seg_size, 16+(max(use_10_vgpr.private_seg_size))
; GCN: .set indirect_use_10_vgpr.uses_vcc, or(1, use_10_vgpr.uses_vcc)
; GCN: .set indirect_use_10_vgpr.uses_flat_scratch, or(0, use_10_vgpr.uses_flat_scratch)
; GCN: .set indirect_use_10_vgpr.has_dyn_sized_stack, or(0, use_10_vgpr.has_dyn_sized_stack)
; GCN: .set indirect_use_10_vgpr.has_recursion, or(0, use_10_vgpr.has_recursion)
; GCN: .set indirect_use_10_vgpr.has_indirect_call, or(0, use_10_vgpr.has_indirect_call)
; GCN: TotalNumSgprs: 38
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define void @indirect_use_10_vgpr() #0 {
call void @use_10_vgpr()
ret void
}
; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
; GCN: .set indirect_2_level_use_10_vgpr.num_vgpr, max(32, indirect_use_10_vgpr.num_vgpr)
; GCN: .set indirect_2_level_use_10_vgpr.num_agpr, max(0, indirect_use_10_vgpr.num_agpr)
; GCN: .set indirect_2_level_use_10_vgpr.numbered_sgpr, max(33, indirect_use_10_vgpr.numbered_sgpr)
; GCN: .set indirect_2_level_use_10_vgpr.private_seg_size, 0+(max(indirect_use_10_vgpr.private_seg_size))
; GCN: .set indirect_2_level_use_10_vgpr.uses_vcc, or(1, indirect_use_10_vgpr.uses_vcc)
; GCN: .set indirect_2_level_use_10_vgpr.uses_flat_scratch, or(1, indirect_use_10_vgpr.uses_flat_scratch)
; GCN: .set indirect_2_level_use_10_vgpr.has_dyn_sized_stack, or(0, indirect_use_10_vgpr.has_dyn_sized_stack)
; GCN: .set indirect_2_level_use_10_vgpr.has_recursion, or(0, indirect_use_10_vgpr.has_recursion)
; GCN: .set indirect_2_level_use_10_vgpr.has_indirect_call, or(0, indirect_use_10_vgpr.has_indirect_call)
; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
call void @indirect_use_10_vgpr()
ret void
}
; GCN-LABEL: {{^}}use_50_vgpr:
; GCN: .set use_50_vgpr.num_vgpr, 50
; GCN: .set use_50_vgpr.num_agpr, 0
; GCN: .set use_50_vgpr.numbered_sgpr, 32
; GCN: .set use_50_vgpr.private_seg_size, 0
; GCN: .set use_50_vgpr.uses_vcc, 0
; GCN: .set use_50_vgpr.uses_flat_scratch, 0
; GCN: .set use_50_vgpr.has_dyn_sized_stack, 0
; GCN: .set use_50_vgpr.has_recursion, 0
; GCN: .set use_50_vgpr.has_indirect_call, 0
; GCN: TotalNumSgprs: 36
; GCN: NumVgprs: 50
; GCN: ScratchSize: 0
define void @use_50_vgpr() #1 {
call void asm sideeffect "", "~{v49}"() #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_50_vgpr:
; GCN: .set indirect_use_50_vgpr.num_vgpr, max(41, use_50_vgpr.num_vgpr)
; GCN: .set indirect_use_50_vgpr.num_agpr, max(0, use_50_vgpr.num_agpr)
; GCN: .set indirect_use_50_vgpr.numbered_sgpr, max(34, use_50_vgpr.numbered_sgpr)
; GCN: .set indirect_use_50_vgpr.private_seg_size, 16+(max(use_50_vgpr.private_seg_size))
; GCN: .set indirect_use_50_vgpr.uses_vcc, or(1, use_50_vgpr.uses_vcc)
; GCN: .set indirect_use_50_vgpr.uses_flat_scratch, or(0, use_50_vgpr.uses_flat_scratch)
; GCN: .set indirect_use_50_vgpr.has_dyn_sized_stack, or(0, use_50_vgpr.has_dyn_sized_stack)
; GCN: .set indirect_use_50_vgpr.has_recursion, or(0, use_50_vgpr.has_recursion)
; GCN: .set indirect_use_50_vgpr.has_indirect_call, or(0, use_50_vgpr.has_indirect_call)
; GCN: TotalNumSgprs: 38
; GCN: NumVgprs: 50
; GCN: ScratchSize: 16
define void @indirect_use_50_vgpr() #0 {
call void @use_50_vgpr()
ret void
}
; GCN-LABEL: {{^}}use_80_sgpr:
; GCN: .set use_80_sgpr.num_vgpr, 1
; GCN: .set use_80_sgpr.num_agpr, 0
; GCN: .set use_80_sgpr.numbered_sgpr, 80
; GCN: .set use_80_sgpr.private_seg_size, 8
; GCN: .set use_80_sgpr.uses_vcc, 0
; GCN: .set use_80_sgpr.uses_flat_scratch, 0
; GCN: .set use_80_sgpr.has_dyn_sized_stack, 0
; GCN: .set use_80_sgpr.has_recursion, 0
; GCN: .set use_80_sgpr.has_indirect_call, 0
; GCN: TotalNumSgprs: 84
; GCN: NumVgprs: 1
; GCN: ScratchSize: 8
define void @use_80_sgpr() #1 {
call void asm sideeffect "", "~{s79}"() #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_80_sgpr:
; GCN: .set indirect_use_80_sgpr.num_vgpr, max(41, use_80_sgpr.num_vgpr)
; GCN: .set indirect_use_80_sgpr.num_agpr, max(0, use_80_sgpr.num_agpr)
; GCN: .set indirect_use_80_sgpr.numbered_sgpr, max(34, use_80_sgpr.numbered_sgpr)
; GCN: .set indirect_use_80_sgpr.private_seg_size, 16+(max(use_80_sgpr.private_seg_size))
; GCN: .set indirect_use_80_sgpr.uses_vcc, or(1, use_80_sgpr.uses_vcc)
; GCN: .set indirect_use_80_sgpr.uses_flat_scratch, or(0, use_80_sgpr.uses_flat_scratch)
; GCN: .set indirect_use_80_sgpr.has_dyn_sized_stack, or(0, use_80_sgpr.has_dyn_sized_stack)
; GCN: .set indirect_use_80_sgpr.has_recursion, or(0, use_80_sgpr.has_recursion)
; GCN: .set indirect_use_80_sgpr.has_indirect_call, or(0, use_80_sgpr.has_indirect_call)
; GCN: TotalNumSgprs: 84
; GCN: NumVgprs: 41
; GCN: ScratchSize: 24
define void @indirect_use_80_sgpr() #1 {
call void @use_80_sgpr()
ret void
}
; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
; GCN: .set indirect_2_level_use_80_sgpr.num_vgpr, max(32, indirect_use_80_sgpr.num_vgpr)
; GCN: .set indirect_2_level_use_80_sgpr.num_agpr, max(0, indirect_use_80_sgpr.num_agpr)
; GCN: .set indirect_2_level_use_80_sgpr.numbered_sgpr, max(33, indirect_use_80_sgpr.numbered_sgpr)
; GCN: .set indirect_2_level_use_80_sgpr.private_seg_size, 0+(max(indirect_use_80_sgpr.private_seg_size))
; GCN: .set indirect_2_level_use_80_sgpr.uses_vcc, or(1, indirect_use_80_sgpr.uses_vcc)
; GCN: .set indirect_2_level_use_80_sgpr.uses_flat_scratch, or(1, indirect_use_80_sgpr.uses_flat_scratch)
; GCN: .set indirect_2_level_use_80_sgpr.has_dyn_sized_stack, or(0, indirect_use_80_sgpr.has_dyn_sized_stack)
; GCN: .set indirect_2_level_use_80_sgpr.has_recursion, or(0, indirect_use_80_sgpr.has_recursion)
; GCN: .set indirect_2_level_use_80_sgpr.has_indirect_call, or(0, indirect_use_80_sgpr.has_indirect_call)
; GCN: TotalNumSgprs: 86
; GCN: NumVgprs: 41
; GCN: ScratchSize: 24
define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
call void @indirect_use_80_sgpr()
ret void
}
; GCN-LABEL: {{^}}use_stack0:
; GCN: .set use_stack0.num_vgpr, 1
; GCN: .set use_stack0.num_agpr, 0
; GCN: .set use_stack0.numbered_sgpr, 33
; GCN: .set use_stack0.private_seg_size, 2052
; GCN: .set use_stack0.uses_vcc, 0
; GCN: .set use_stack0.uses_flat_scratch, 0
; GCN: .set use_stack0.has_dyn_sized_stack, 0
; GCN: .set use_stack0.has_recursion, 0
; GCN: .set use_stack0.has_indirect_call, 0
; GCN: TotalNumSgprs: 37
; GCN: NumVgprs: 1
; GCN: ScratchSize: 2052
define void @use_stack0() #1 {
%alloca = alloca [512 x i32], align 4, addrspace(5)
call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
ret void
}
; GCN-LABEL: {{^}}use_stack1:
; GCN: .set use_stack1.num_vgpr, 1
; GCN: .set use_stack1.num_agpr, 0
; GCN: .set use_stack1.numbered_sgpr, 33
; GCN: .set use_stack1.private_seg_size, 404
; GCN: .set use_stack1.uses_vcc, 0
; GCN: .set use_stack1.uses_flat_scratch, 0
; GCN: .set use_stack1.has_dyn_sized_stack, 0
; GCN: .set use_stack1.has_recursion, 0
; GCN: .set use_stack1.has_indirect_call, 0
; GCN: TotalNumSgprs: 37
; GCN: NumVgprs: 1
; GCN: ScratchSize: 404
define void @use_stack1() #1 {
%alloca = alloca [100 x i32], align 4, addrspace(5)
call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_stack:
; GCN: .set indirect_use_stack.num_vgpr, max(41, use_stack0.num_vgpr)
; GCN: .set indirect_use_stack.num_agpr, max(0, use_stack0.num_agpr)
; GCN: .set indirect_use_stack.numbered_sgpr, max(34, use_stack0.numbered_sgpr)
; GCN: .set indirect_use_stack.private_seg_size, 80+(max(use_stack0.private_seg_size))
; GCN: .set indirect_use_stack.uses_vcc, or(1, use_stack0.uses_vcc)
; GCN: .set indirect_use_stack.uses_flat_scratch, or(0, use_stack0.uses_flat_scratch)
; GCN: .set indirect_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack)
; GCN: .set indirect_use_stack.has_recursion, or(0, use_stack0.has_recursion)
; GCN: .set indirect_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call)
; GCN: TotalNumSgprs: 38
; GCN: NumVgprs: 41
; GCN: ScratchSize: 2132
define void @indirect_use_stack() #1 {
%alloca = alloca [16 x i32], align 4, addrspace(5)
call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
call void @use_stack0()
ret void
}
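; e.g., 80 bytes of local frame plus max(use_stack0.private_seg_size = 2052) gives the
; 2132-byte ScratchSize checked above.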
; GCN-LABEL: {{^}}indirect_2_level_use_stack:
; GCN: .set indirect_2_level_use_stack.num_vgpr, max(32, indirect_use_stack.num_vgpr)
; GCN: .set indirect_2_level_use_stack.num_agpr, max(0, indirect_use_stack.num_agpr)
; GCN: .set indirect_2_level_use_stack.numbered_sgpr, max(33, indirect_use_stack.numbered_sgpr)
; GCN: .set indirect_2_level_use_stack.private_seg_size, 0+(max(indirect_use_stack.private_seg_size))
; GCN: .set indirect_2_level_use_stack.uses_vcc, or(1, indirect_use_stack.uses_vcc)
; GCN: .set indirect_2_level_use_stack.uses_flat_scratch, or(1, indirect_use_stack.uses_flat_scratch)
; GCN: .set indirect_2_level_use_stack.has_dyn_sized_stack, or(0, indirect_use_stack.has_dyn_sized_stack)
; GCN: .set indirect_2_level_use_stack.has_recursion, or(0, indirect_use_stack.has_recursion)
; GCN: .set indirect_2_level_use_stack.has_indirect_call, or(0, indirect_use_stack.has_indirect_call)
; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 2132
define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
call void @indirect_use_stack()
ret void
}
; Should be the maximum of the callees' usage
; GCN-LABEL: {{^}}multi_call_use_use_stack:
; GCN: .set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr)
; GCN: .set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr)
; GCN: .set multi_call_use_use_stack.numbered_sgpr, max(42, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr)
; GCN: .set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size))
; GCN: .set multi_call_use_use_stack.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc)
; GCN: .set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch)
; GCN: .set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack)
; GCN: .set multi_call_use_use_stack.has_recursion, or(0, use_stack0.has_recursion, use_stack1.has_recursion)
; GCN: .set multi_call_use_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call)
; GCN: TotalNumSgprs: 48
; GCN: NumVgprs: 41
; GCN: ScratchSize: 2052
define amdgpu_kernel void @multi_call_use_use_stack() #0 {
call void @use_stack0()
call void @use_stack1()
ret void
}
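; e.g., private_seg_size resolves to 0+(max(2052, 404)) = 2052, the larger of the two
; callees' frames, matching the ScratchSize check above.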
declare void @external() #0
; GCN-LABEL: {{^}}multi_call_with_external:
; GCN: .set multi_call_with_external.num_vgpr, max(41, amdgpu.max_num_vgpr)
; GCN: .set multi_call_with_external.num_agpr, max(0, amdgpu.max_num_agpr)
; GCN: .set multi_call_with_external.numbered_sgpr, max(42, amdgpu.max_num_sgpr)
; GCN: .set multi_call_with_external.private_seg_size, 0
; GCN: .set multi_call_with_external.uses_vcc, 1
; GCN: .set multi_call_with_external.uses_flat_scratch, 1
; GCN: .set multi_call_with_external.has_dyn_sized_stack, 1
; GCN: .set multi_call_with_external.has_recursion, 0
; GCN: .set multi_call_with_external.has_indirect_call, 1
; GCN: TotalNumSgprs: multi_call_with_external.numbered_sgpr+6
; GCN: NumVgprs: multi_call_with_external.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @multi_call_with_external() #0 {
call void @use_stack0()
call void @use_stack1()
call void @external()
ret void
}
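; The external callee's usage is unknown at compile time, so the register expressions fall
; back to the module-level amdgpu.max_num_* symbols defined at the end of this file.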
; GCN-LABEL: {{^}}usage_external:
; GCN: .set usage_external.num_vgpr, max(32, amdgpu.max_num_vgpr)
; GCN: .set usage_external.num_agpr, max(0, amdgpu.max_num_agpr)
; GCN: .set usage_external.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; GCN: .set usage_external.private_seg_size, 0
; GCN: .set usage_external.uses_vcc, 1
; GCN: .set usage_external.uses_flat_scratch, 1
; GCN: .set usage_external.has_dyn_sized_stack, 1
; GCN: .set usage_external.has_recursion, 0
; GCN: .set usage_external.has_indirect_call, 1
; GCN: TotalNumSgprs: usage_external.numbered_sgpr+6
; GCN: NumVgprs: usage_external.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @usage_external() #0 {
call void @external()
ret void
}
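; The +6 in TotalNumSgprs is extrasgprs() folded with uses_vcc = 1 and uses_flat_scratch = 1;
; compare use_vcc (32 -> 36) and use_flat_scratch (32 -> 38) above, where only one flag is set.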
declare void @external_recurse() #2
; GCN-LABEL: {{^}}usage_external_recurse:
; GCN: .set usage_external_recurse.num_vgpr, max(32, amdgpu.max_num_vgpr)
; GCN: .set usage_external_recurse.num_agpr, max(0, amdgpu.max_num_agpr)
; GCN: .set usage_external_recurse.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; GCN: .set usage_external_recurse.private_seg_size, 0
; GCN: .set usage_external_recurse.uses_vcc, 1
; GCN: .set usage_external_recurse.uses_flat_scratch, 1
; GCN: .set usage_external_recurse.has_dyn_sized_stack, 1
; GCN: .set usage_external_recurse.has_recursion, 1
; GCN: .set usage_external_recurse.has_indirect_call, 1
; GCN: TotalNumSgprs: usage_external_recurse.numbered_sgpr+6
; GCN: NumVgprs: usage_external_recurse.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @usage_external_recurse() #0 {
call void @external_recurse()
ret void
}
; GCN-LABEL: {{^}}direct_recursion_use_stack:
; GCN: .set direct_recursion_use_stack.num_vgpr, 41
; GCN: .set direct_recursion_use_stack.num_agpr, 0
; GCN: .set direct_recursion_use_stack.numbered_sgpr, 36
; GCN: .set direct_recursion_use_stack.private_seg_size, 2064
; GCN: .set direct_recursion_use_stack.uses_vcc, 1
; GCN: .set direct_recursion_use_stack.uses_flat_scratch, 0
; GCN: .set direct_recursion_use_stack.has_dyn_sized_stack, 0
; GCN: .set direct_recursion_use_stack.has_recursion, 1
; GCN: .set direct_recursion_use_stack.has_indirect_call, 0
; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 2064
define void @direct_recursion_use_stack(i32 %val) #2 {
%alloca = alloca [512 x i32], align 4, addrspace(5)
call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
%cmp = icmp eq i32 %val, 0
br i1 %cmp, label %ret, label %call
call:
%val.sub1 = sub i32 %val, 1
call void @direct_recursion_use_stack(i32 %val.sub1)
br label %ret
ret:
ret void
}
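; Note the resource symbols are plain constants here: a directly recursive function's
; worst case is folded locally rather than emitted as a self-referential expression.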
; GCN-LABEL: {{^}}usage_direct_recursion:
; GCN: .set usage_direct_recursion.num_vgpr, max(32, direct_recursion_use_stack.num_vgpr)
; GCN: .set usage_direct_recursion.num_agpr, max(0, direct_recursion_use_stack.num_agpr)
; GCN: .set usage_direct_recursion.numbered_sgpr, max(33, direct_recursion_use_stack.numbered_sgpr)
; GCN: .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size))
; GCN: .set usage_direct_recursion.uses_vcc, or(1, direct_recursion_use_stack.uses_vcc)
; GCN: .set usage_direct_recursion.uses_flat_scratch, or(1, direct_recursion_use_stack.uses_flat_scratch)
; GCN: .set usage_direct_recursion.has_dyn_sized_stack, or(0, direct_recursion_use_stack.has_dyn_sized_stack)
; GCN: .set usage_direct_recursion.has_recursion, or(1, direct_recursion_use_stack.has_recursion)
; GCN: .set usage_direct_recursion.has_indirect_call, or(0, direct_recursion_use_stack.has_indirect_call)
; GCN: TotalNumSgprs: 42
; GCN: NumVgprs: 41
; GCN: ScratchSize: 2064
define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
call void @direct_recursion_use_stack(i32 %n)
ret void
}
; Make sure there's no assert when an sgpr96 is used.
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(32, amdgpu.max_num_vgpr)
; GCN: .set count_use_sgpr96_external_call.num_agpr, max(0, amdgpu.max_num_agpr)
; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; GCN: .set count_use_sgpr96_external_call.private_seg_size, 0
; GCN: .set count_use_sgpr96_external_call.uses_vcc, 1
; GCN: .set count_use_sgpr96_external_call.uses_flat_scratch, 1
; GCN: .set count_use_sgpr96_external_call.has_dyn_sized_stack, 1
; GCN: .set count_use_sgpr96_external_call.has_recursion, 0
; GCN: .set count_use_sgpr96_external_call.has_indirect_call, 1
; GCN: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+6
; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @count_use_sgpr96_external_call() {
entry:
tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
call void @external()
ret void
}
; Make sure there's no assert when an sgpr160 is used.
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(32, amdgpu.max_num_vgpr)
; GCN: .set count_use_sgpr160_external_call.num_agpr, max(0, amdgpu.max_num_agpr)
; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; GCN: .set count_use_sgpr160_external_call.private_seg_size, 0
; GCN: .set count_use_sgpr160_external_call.uses_vcc, 1
; GCN: .set count_use_sgpr160_external_call.uses_flat_scratch, 1
; GCN: .set count_use_sgpr160_external_call.has_dyn_sized_stack, 1
; GCN: .set count_use_sgpr160_external_call.has_recursion, 0
; GCN: .set count_use_sgpr160_external_call.has_indirect_call, 1
; GCN: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+6
; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @count_use_sgpr160_external_call() {
entry:
tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
call void @external()
ret void
}
; Make sure there's no assert when a vgpr160 is used.
; GCN-LABEL: {{^}}count_use_vgpr160_external_call
; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(32, amdgpu.max_num_vgpr)
; GCN: .set count_use_vgpr160_external_call.num_agpr, max(0, amdgpu.max_num_agpr)
; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
; GCN: .set count_use_vgpr160_external_call.private_seg_size, 0
; GCN: .set count_use_vgpr160_external_call.uses_vcc, 1
; GCN: .set count_use_vgpr160_external_call.uses_flat_scratch, 1
; GCN: .set count_use_vgpr160_external_call.has_dyn_sized_stack, 1
; GCN: .set count_use_vgpr160_external_call.has_recursion, 0
; GCN: .set count_use_vgpr160_external_call.has_indirect_call, 1
; GCN: TotalNumSgprs: count_use_vgpr160_external_call.numbered_sgpr+6
; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @count_use_vgpr160_external_call() {
entry:
tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
call void @external()
ret void
}
; Added at the end of the .s file are the module-level maximums
; GCN: .set amdgpu.max_num_vgpr, 50
; GCN: .set amdgpu.max_num_agpr, 0
; GCN: .set amdgpu.max_num_sgpr, 80
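; These module-level maxima come from the largest per-function values above (use_50_vgpr's
; 50 VGPRs, use_80_sgpr's 80 numbered SGPRs) and seed the max(..., amdgpu.max_num_*)
; expressions at the external/indirect call sites.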
attributes #0 = { nounwind noinline norecurse }
attributes #1 = { nounwind noinline norecurse }
attributes #2 = { nounwind noinline }


@ -41,7 +41,7 @@ entry:
}
; FIXME: This should warn too
; ERR-NOT: warning
; ERR-NOT: warning: inline asm clobber list contains reserved registers
define amdgpu_kernel void @def_exec(ptr addrspace(1) %ptr) {
entry:
%exec = call i64 asm sideeffect "; def $0", "={exec}"()


@ -3,6 +3,18 @@
declare i32 @llvm.amdgcn.workitem.id.x()
define <2 x i64> @f1() #0 {
; GFX11-LABEL: f1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
ret <2 x i64> zeroinitializer
}
define void @f0() {
; GFX11-LABEL: f0:
; GFX11: ; %bb.0: ; %bb
@ -36,18 +48,6 @@ bb:
ret void
}
define <2 x i64> @f1() #0 {
; GFX11-LABEL: f1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
ret <2 x i64> zeroinitializer
}
; FIXME: This generates "instid1(/* invalid instid value */)".
define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
; GFX11-LABEL: f2:


@ -30,7 +30,7 @@ define hidden void @func() #1 {
; GCN-NOT: writelane
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
; GCN: ; NumSgprs: 37
; GCN: ; TotalNumSgprs: 37
; GCN: ; NumVgprs: 9
define amdgpu_kernel void @kernel_call() #0 {
%vgpr = load volatile i32, ptr addrspace(1) undef
@ -48,7 +48,7 @@ define amdgpu_kernel void @kernel_call() #0 {
; GCN-NOT: readlane
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
; GCN: ; NumSgprs: 34
; GCN: ; TotalNumSgprs: 34
; GCN: ; NumVgprs: 10
define void @func_regular_call() #1 {
%vgpr = load volatile i32, ptr addrspace(1) undef
@ -64,7 +64,7 @@ define void @func_regular_call() #1 {
; GCN-NEXT: s_addc_u32 s17,
; GCN-NEXT: s_setpc_b64 s[16:17]
; GCN: ; NumSgprs: 32
; GCN: ; TotalNumSgprs: 32
; GCN: ; NumVgprs: 8
define void @func_tail_call() #1 {
tail call void @func()
@ -77,7 +77,7 @@ define void @func_tail_call() #1 {
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
; GCN: s_setpc_b64
; GCN: ; NumSgprs: 34
; GCN: ; TotalNumSgprs: 34
; GCN: ; NumVgprs: 10
define void @func_call_tail_call() #1 {
%vgpr = load volatile i32, ptr addrspace(1) undef
@ -105,13 +105,6 @@ define void @test_funcx2() #0 {
ret void
}
; GCN-LABEL: {{^}}wombat:
define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) {
bb:
call void @hoge() #0
ret void
}
; Make sure we save/restore the return address around the call.
; Function Attrs: norecurse
define internal void @hoge() #2 {
@ -128,6 +121,13 @@ bb:
ret void
}
; GCN-LABEL: {{^}}wombat:
define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) {
bb:
call void @hoge() #0
ret void
}
declare dso_local void @eggs()


@ -149,12 +149,9 @@
; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O0-NEXT: Machine Optimization Remark Emitter
; GCN-O0-NEXT: Stack Frame Layout Analysis
; GCN-O0-NEXT: Function register usage analysis
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O0-NEXT: Machine Optimization Remark Emitter
; GCN-O0-NEXT: AMDGPU Assembly Printer
; GCN-O0-NEXT: Free MachineFunction
; GCN-O0-NEXT: Function register usage analysis
; GCN-O0-NEXT: AMDGPU Assembly Printer
; GCN-O0-NEXT: Free MachineFunction
; GCN-O1:Target Library Information
; GCN-O1-NEXT:Target Pass Configuration
@ -427,12 +424,9 @@
; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-NEXT: Machine Optimization Remark Emitter
; GCN-O1-NEXT: Stack Frame Layout Analysis
; GCN-O1-NEXT: Function register usage analysis
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-NEXT: Machine Optimization Remark Emitter
; GCN-O1-NEXT: AMDGPU Assembly Printer
; GCN-O1-NEXT: Free MachineFunction
; GCN-O1-NEXT: Function register usage analysis
; GCN-O1-NEXT: AMDGPU Assembly Printer
; GCN-O1-NEXT: Free MachineFunction
; GCN-O1-OPTS:Target Library Information
; GCN-O1-OPTS-NEXT:Target Pass Configuration
@ -733,12 +727,9 @@
; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: Stack Frame Layout Analysis
; GCN-O1-OPTS-NEXT: Function register usage analysis
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer
; GCN-O1-OPTS-NEXT: Free MachineFunction
; GCN-O1-OPTS-NEXT: Function register usage analysis
; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer
; GCN-O1-OPTS-NEXT: Free MachineFunction
; GCN-O2:Target Library Information
; GCN-O2-NEXT:Target Pass Configuration
@ -1045,12 +1036,9 @@
; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O2-NEXT: Machine Optimization Remark Emitter
; GCN-O2-NEXT: Stack Frame Layout Analysis
; GCN-O2-NEXT: Function register usage analysis
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O2-NEXT: Machine Optimization Remark Emitter
; GCN-O2-NEXT: AMDGPU Assembly Printer
; GCN-O2-NEXT: Free MachineFunction
; GCN-O2-NEXT: Function register usage analysis
; GCN-O2-NEXT: AMDGPU Assembly Printer
; GCN-O2-NEXT: Free MachineFunction
; GCN-O3:Target Library Information
; GCN-O3-NEXT:Target Pass Configuration
@ -1369,12 +1357,9 @@
; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O3-NEXT: Machine Optimization Remark Emitter
; GCN-O3-NEXT: Stack Frame Layout Analysis
; GCN-O3-NEXT: Function register usage analysis
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O3-NEXT: Machine Optimization Remark Emitter
; GCN-O3-NEXT: AMDGPU Assembly Printer
; GCN-O3-NEXT: Free MachineFunction
; GCN-O3-NEXT: Function register usage analysis
; GCN-O3-NEXT: AMDGPU Assembly Printer
; GCN-O3-NEXT: Free MachineFunction
define void @empty() {
ret void


@ -9,6 +9,19 @@
@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
; GCN-LABEL: {{^}}f0:
; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
; GCN: ds_write_b8 [[NULL]], [[TREE]]
define void @f0() {
; OPT-LABEL: @f0() {
; OPT-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1
; OPT-NEXT: ret void
;
store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1
ret void
}
; GCN-LABEL: {{^}}k0:
; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
@ -29,16 +42,3 @@ define amdgpu_kernel void @k0() {
call void @f0()
ret void
}
; GCN-LABEL: {{^}}f0:
; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
; GCN: ds_write_b8 [[NULL]], [[TREE]]
define void @f0() {
; OPT-LABEL: @f0() {
; OPT-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1
; OPT-NEXT: ret void
;
store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1
ret void
}


@ -24,6 +24,55 @@ store i32 0, ptr addrspace(3) @used_by_kernel
}
; CHECK: ; LDSByteSize: 4 bytes
define void @nonkernel() {
; GFX9-LABEL: nonkernel:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
; GFX9-NEXT: ds_write_b64 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: nonkernel:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
; GFX10-NEXT: ds_write_b64 v0, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; G_GFX9-LABEL: nonkernel:
; G_GFX9: ; %bb.0:
; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; G_GFX9-NEXT: v_mov_b32_e32 v2, 0
; G_GFX9-NEXT: v_mov_b32_e32 v3, 8
; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
; G_GFX9-NEXT: v_mov_b32_e32 v1, 0
; G_GFX9-NEXT: ds_write_b32 v3, v2
; G_GFX9-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; G_GFX10-LABEL: nonkernel:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
; G_GFX10-NEXT: v_mov_b32_e32 v3, 8
; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
; G_GFX10-NEXT: ds_write_b32 v3, v2
; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: s_setpc_b64 s[30:31]
store i32 0, ptr addrspace(3) @used_by_both
store double 0.0, ptr addrspace(3) @used_by_function
ret void
}
; Needs to allocate both variables; the store to used_by_both is at offset sizeof(double)
define amdgpu_kernel void @withcall() {
; GFX9-LABEL: withcall:
@ -171,55 +220,5 @@ define amdgpu_kernel void @nocall_false_sharing() {
}
; CHECK: ; LDSByteSize: 4 bytes
define void @nonkernel() {
; GFX9-LABEL: nonkernel:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
; GFX9-NEXT: ds_write_b64 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: nonkernel:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
; GFX10-NEXT: ds_write_b64 v0, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; G_GFX9-LABEL: nonkernel:
; G_GFX9: ; %bb.0:
; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; G_GFX9-NEXT: v_mov_b32_e32 v2, 0
; G_GFX9-NEXT: v_mov_b32_e32 v3, 8
; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
; G_GFX9-NEXT: v_mov_b32_e32 v1, 0
; G_GFX9-NEXT: ds_write_b32 v3, v2
; G_GFX9-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; G_GFX10-LABEL: nonkernel:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
; G_GFX10-NEXT: v_mov_b32_e32 v3, 8
; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
; G_GFX10-NEXT: ds_write_b32 v3, v2
; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT: s_setpc_b64 s[30:31]
store i32 0, ptr addrspace(3) @used_by_both
store double 0.0, ptr addrspace(3) @used_by_function
ret void
}
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}


@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s
; CHECK-LABEL: {{^}}_amdgpu_cs_main:
; CHECK: ; NumSgprs: 4
; CHECK: ; TotalNumSgprs: 4
; CHECK: ; NumVgprs: 2
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---


@ -3,7 +3,11 @@
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=V5 %s
; CHECK-LABEL: {{^}}recursive:
; CHECK: .set recursive.private_seg_size, 16+(max(16384))
; CHECK: ScratchSize: 16
; V5-LABEL: {{^}}recursive:
; V5: .set recursive.has_recursion, 1
define void @recursive() {
call void @recursive()
store volatile i32 0, ptr addrspace(1) undef
@ -11,18 +15,22 @@ define void @recursive() {
}
; CHECK-LABEL: {{^}}tail_recursive:
; CHECK: .set tail_recursive.private_seg_size, 0
; CHECK: ScratchSize: 0
define void @tail_recursive() {
tail call void @tail_recursive()
ret void
}
; CHECK: .set calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
define void @calls_tail_recursive() norecurse {
tail call void @tail_recursive()
ret void
}
; CHECK-LABEL: {{^}}tail_recursive_with_stack:
; CHECK: .set tail_recursive_with_stack.private_seg_size, 8
; CHECK: .set tail_recursive_with_stack.has_recursion, 1
define void @tail_recursive_with_stack() {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
@ -33,11 +41,11 @@ define void @tail_recursive_with_stack() {
; For an arbitrary recursive call, report a large number for unknown stack
; usage for code object v4 and older
; CHECK-LABEL: {{^}}calls_recursive:
; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}}
; CHECK: .set calls_recursive.private_seg_size, 0+(max(16384, recursive.private_seg_size))
;
; V5-LABEL: {{^}}calls_recursive:
; V5: .amdhsa_private_segment_fixed_size 0{{$}}
; V5: .amdhsa_uses_dynamic_stack 1
; V5: .set calls_recursive.private_seg_size, 0+(max(recursive.private_seg_size))
; V5: .set calls_recursive.has_dyn_sized_stack, or(0, recursive.has_dyn_sized_stack)
define amdgpu_kernel void @calls_recursive() {
call void @recursive()
ret void
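; e.g., with recursive.private_seg_size = 16+(max(16384)) = 16400 from above, this resolves
; to 0+(max(16384, 16400)) = 16400, matching the old 16400 fixed-size check.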
@ -46,7 +54,7 @@ define amdgpu_kernel void @calls_recursive() {
; Make sure we do not report a huge stack size for tail recursive
; functions
; CHECK-LABEL: {{^}}kernel_indirectly_calls_tail_recursive:
; CHECK: .amdhsa_private_segment_fixed_size 0{{$}}
; CHECK: .set kernel_indirectly_calls_tail_recursive.private_seg_size, 0+(max(calls_tail_recursive.private_seg_size))
define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
call void @calls_tail_recursive()
ret void
@ -57,22 +65,22 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
; in the kernel.
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
; CHECK: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(16384, tail_recursive.private_seg_size))
;
; V5-LABEL: {{^}}kernel_calls_tail_recursive:
; V5: .amdhsa_private_segment_fixed_size 0{{$}}
; V5: .amdhsa_uses_dynamic_stack 1
; V5: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
; V5: .set kernel_calls_tail_recursive.has_recursion, or(1, tail_recursive.has_recursion)
define amdgpu_kernel void @kernel_calls_tail_recursive() {
call void @tail_recursive()
ret void
}
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
; CHECK: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(16384, tail_recursive_with_stack.private_seg_size))
;
; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
; V5: .amdhsa_private_segment_fixed_size 8{{$}}
; V5: .amdhsa_uses_dynamic_stack 1
; V5: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(tail_recursive_with_stack.private_seg_size))
; V5: .set kernel_calls_tail_recursive_with_stack.has_dyn_sized_stack, or(0, tail_recursive_with_stack.has_dyn_sized_stack)
define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
call void @tail_recursive_with_stack()
ret void


@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
; SI-LABEL: {{^}}foo:
; SI: .section .AMDGPU.csdata
; SI: ; Kernel info:
; SI: ; NumSgprs: {{[0-9]+}}
; SI: ; TotalNumSgprs: {{[0-9]+}}
; SI: ; NumVgprs: {{[0-9]+}}
define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %abase, ptr addrspace(1) %bbase) nounwind {
%mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);


@ -2,7 +2,7 @@
; RUN: FileCheck -check-prefix=REMARK %s < %t
; STDERR: remark: foo.cl:27:0: Function Name: test_kernel
; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: 28
; STDERR-NEXT: remark: foo.cl:27:0: TotalSGPRs: 28
; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: 9
; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: 43
; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: 0
@ -27,7 +27,7 @@
; REMARK-NEXT: DebugLoc: { File: foo.cl, Line: 27, Column: 0 }
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' SGPRs: '
; REMARK-NEXT: - String: ' TotalSGPRs: '
; REMARK-NEXT: - NumSGPR: '28'
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
@ -122,7 +122,7 @@ define void @test_func() !dbg !6 {
}
; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: 4
; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 4
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
@ -141,12 +141,12 @@ define void @empty_func() !dbg !8 {
}
; STDERR: remark: foo.cl:64:0: Function Name: test_indirect_call
; STDERR-NEXT: remark: foo.cl:64:0: SGPRs: 39
; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: 32
; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: 10
; STDERR-NEXT: remark: foo.cl:64:0: TotalSGPRs: test_indirect_call.numbered_sgpr+6
; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: test_indirect_call.num_vgpr
; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: test_indirect_call.num_agpr
; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: 0
; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: True
; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: 8
; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.numbered_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:64:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:64:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:64:0: LDS Size [bytes/block]: 0
@ -159,12 +159,12 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 {
}
; STDERR: remark: foo.cl:74:0: Function Name: test_indirect_w_static_stack
; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: 39
; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: 32
; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: 10
; STDERR-NEXT: remark: foo.cl:74:0: TotalSGPRs: test_indirect_w_static_stack.numbered_sgpr+6
; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: test_indirect_w_static_stack.num_vgpr
; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: test_indirect_w_static_stack.num_agpr
; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144
; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True
; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: 8
; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.numbered_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:74:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0


@ -1,6 +1,6 @@
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s
; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN,ALL %s
; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s
; Make sure there's no assertion when trying to report the resource
; usage for a function which becomes dead during codegen.
@ -21,9 +21,10 @@ define internal fastcc void @unreachable() {
; GCN-NOT: s_swappc_b64
; GCN: s_endpgm
; GCN: .amdhsa_private_segment_fixed_size 0
; GCN-NOT: .amdhsa_uses_dynamic_stack 0
; GCN-V5: .amdhsa_uses_dynamic_stack 0
; GCN-NOT: .amdhsa_uses_dynamic_stack
; GCN-V5: .amdhsa_uses_dynamic_stack
; ALL: .set entry.private_seg_size, 0
; ALL: .set entry.has_dyn_sized_stack, 0
define amdgpu_kernel void @entry() {
bb0:
br i1 false, label %bb1, label %bb2