//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
|
|
#include "AMDGPUCallLowering.h"
|
|
#include "AMDGPUInstructionSelector.h"
|
|
#include "AMDGPULegalizerInfo.h"
|
|
#include "AMDGPURegisterBankInfo.h"
|
|
#include "R600Subtarget.h"
|
|
#include "SIMachineFunctionInfo.h"
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
|
|
#include "llvm/CodeGen/MachineScheduler.h"
|
|
#include "llvm/CodeGen/TargetFrameLowering.h"
|
|
#include "llvm/IR/DiagnosticInfo.h"
|
|
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
|
#include "llvm/IR/IntrinsicsR600.h"
|
|
#include "llvm/IR/MDBuilder.h"
|
|
#include <algorithm>
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "amdgpu-subtarget"
|
|
|
|
AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {}

bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
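//
// A worked example with illustrative (not target-specific) numbers: with a
// 64-lane wavefront, a maximum flat workgroup size of 256, and 4 EUs per CU,
// NWaves = 8 gives WavesPerWorkgroup = ceil(256 / 64) = 4 and
// WorkGroupsPerCU = (8 * 4) / 4 = 8, so a 64 KiB LDS budget yields
// 65536 / 8 = 8192 bytes of LDS per workgroup.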
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

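// An illustrative walk-through of the computation below (all numbers
// hypothetical): assume a 64-lane wavefront, 4 EUs per CU, at most 10 waves
// per EU, 64 KiB of LDS, and a kernel using 16 KiB of LDS with flat workgroup
// sizes [64, 256]. Then MaxWGsLDS = 4. The minimum group size yields 1 wave
// per group and min(getMaxWorkGroupsPerCU(64), 4) = 4 groups per CU (assuming
// the hardware limit exceeds 4), i.e. 4 waves per CU; the maximum group size
// yields 4 waves per group and 4 groups per CU, i.e. 16 waves per CU. Because
// LDS caps both cases at 4 groups, the "minimum" exceeds the "maximum", the
// two are swapped, and the result is an occupancy range of {1, 4} waves/EU.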
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
                                                const Function &F) const {
  // FIXME: We should take into account the LDS allocation granularity.
  const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);

  // Queried LDS size may be larger than available on a CU, in which case we
  // consider the only achievable occupancy to be 1, in line with what we
  // consider the occupancy to be when the number of requested registers in a
  // particular bank is higher than the number of available ones in that bank.
  if (!MaxWGsLDS)
    return {1, 1};

  const unsigned WaveSize = getWavefrontSize(), WavesPerEU = getMaxWavesPerEU();

  auto PropsFromWGSize = [=](unsigned WGSize)
      -> std::tuple<const unsigned, const unsigned, unsigned> {
    unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
    unsigned WGsPerCU = std::min(getMaxWorkGroupsPerCU(WGSize), MaxWGsLDS);
    return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
  };

  // The maximum group size will generally yield the minimum number of
  // workgroups, maximum number of waves, and minimum occupancy. The opposite is
  // generally true for the minimum group size. LDS or barrier resource
  // limitations can flip those minimums/maximums.
  const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F);
  auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
  auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);

  // It is possible that we end up with flipped minimum and maximum number of
  // waves per CU when the number of minimum/maximum concurrent groups on the CU
  // is limited by LDS usage or barrier resources.
  if (MinWavesPerCU >= MaxWavesPerCU) {
    std::swap(MinWavesPerCU, MaxWavesPerCU);
  } else {
    const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();

    // Look for a potential smaller group size than the maximum which decreases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned MinWavesPerCUForWGSize =
        divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
    if (MinWavesPerCU > MinWavesPerCUForWGSize) {
      unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
      if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
        // There may exist a smaller group size than the maximum that achieves
        // the minimum number of waves per CU. This group size is the largest
        // possible size that requires MaxWavesPerWG - E waves where E is
        // maximized under the following constraints.
        // 1. 0 <= E <= ExcessSlotsPerWG
        // 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize
        MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
                                                MaxWavesPerWG - MinWavesPerWG);
      }
    }

    // Look for a potential larger group size than the minimum which increases
    // the concurrent number of waves on the CU for the same number of
    // concurrent workgroups on the CU.
    unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
    if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
      // There may exist a larger group size than the minimum that achieves the
      // maximum number of waves per CU. This group size is the smallest
      // possible size that requires MinWavesPerWG + L waves where L is
      // maximized under the following constraints.
      // 1. 0 <= L <= LeftoverSlotsPerWG
      // 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize
      MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
                                              ((MaxWGSize - 1) / WaveSize) + 1 -
                                                  MinWavesPerWG);
    }
  }

  // Return the minimum/maximum number of waves on any EU, assuming that all
  // wavefronts are spread across all EUs as evenly as possible.
  return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
          std::clamp(divideCeil(MaxWavesPerCU, getEUsPerCU()), 1U, WavesPerEU)};
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
    const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithWorkGroupSizes(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

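// For example, a kernel carrying the IR attribute
//   "amdgpu-flat-work-group-size"="128,256"
// requests a minimum flat workgroup size of 128 and a maximum of 256. Pairs
// that are inverted or that violate the subtarget's limits fall back to the
// defaults computed above.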
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

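// An illustrative case for the validation below: if the maximum flat workgroup
// size implies that at least 3 waves must fit on each EU (a hypothetical value
// of getWavesPerEUForWorkGroup), then a requested range of [2, 4] waves per EU
// is unsatisfiable and the default of [3, getMaxWavesPerEU()] is used instead.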
std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set default
  // minimum/maximum number of waves per execution unit to values implied by
  // requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

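// For example, "amdgpu-waves-per-eu"="2,4" requests between 2 and 4 waves per
// execution unit, while "amdgpu-waves-per-eu"="2" requests a minimum of 2 with
// no explicit maximum; the attribute's second value is optional, hence the
// OnlyFirstRequired form of the query below.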
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

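// For example, an OpenCL kernel compiled with
// __attribute__((reqd_work_group_size(64, 1, 1))) carries function metadata of
// the form
//   !reqd_work_group_size !{i32 64, i32 1, i32 1}
// and getReqdWorkGroupSize(Kernel, 0) returns 64; the UINT_MAX sentinel
// signals that no required size is attached.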
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto *Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

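// For example, with a required workgroup size of 64x1x1, a call to
// llvm.amdgcn.workitem.id.x yields values in the half-open range [0, 64),
// while a local-size query yields exactly 64, i.e. the range [64, 65). The
// bound is attached as a return-range attribute on calls and as !range
// metadata on other instructions, e.g.
//   !range !{i32 0, i32 64}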
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  APInt Lower{32, MinSize};
  APInt Upper{32, MaxSize};
  if (auto *CI = dyn_cast<CallBase>(I)) {
    ConstantRange Range(Lower, Upper);
    CI->addRangeRetAttr(Range);
  } else {
    MDBuilder MDB(I->getContext());
    MDNode *MaxWorkGroupSizeRange = MDB.createRange(Lower, Upper);
    I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  }
  return true;
}

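// For example, under code object v5 the implicit argument segment defaults to
// 256 bytes (56 bytes for earlier versions), and a frontend can override the
// size explicitly with an attribute such as
//   "amdgpu-implicitarg-num-bytes"="48"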
unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

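// A layout sketch for a hypothetical kernel signature (ptr, i32, <4 x float>),
// assuming an 8-byte, 8-aligned pointer: the pointer occupies bytes [0, 8),
// the i32 bytes [8, 12), and the 16-byte-aligned vector bytes [16, 32), so the
// loop below computes ExplicitArgBytes = 32 and MaxAlign = 16.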
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      continue;

    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

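// Continuing the sketch above on a hypothetical target with a zero explicit
// kernarg offset and an 8-byte implicit-argument alignment: 32 explicit bytes
// plus a 256-byte implicit segment give alignTo(32, 8) + 256 = 288 bytes,
// which is already a multiple of 4.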
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().isAMDGCN())
    return static_cast<const AMDGPUSubtarget &>(
        MF.getSubtarget<GCNSubtarget>());
  return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().isAMDGCN())
    return static_cast<const AMDGPUSubtarget &>(
        TM.getSubtarget<GCNSubtarget>(F));
  return static_cast<const AMDGPUSubtarget &>(
      TM.getSubtarget<R600Subtarget>(F));
}

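// For example, "amdgpu-max-num-workgroups"="16,8,4" bounds a kernel's grid to
// at most 16, 8, and 4 workgroups in the x, y, and z dimensions respectively.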
// FIXME: This has no reason to be in subtarget
SmallVector<unsigned>
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
  return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
                                        std::numeric_limits<uint32_t>::max());
}