llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

360 lines
12 KiB
C++

//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// Base class for AMDGPU specific classes of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#include "llvm/IR/CallingConv.h"
#include "llvm/Support/Alignment.h"
#include "llvm/TargetParser/Triple.h"
namespace llvm {
enum AMDGPUDwarfFlavour : unsigned;
class Function;
class Instruction;
class MachineFunction;
class TargetMachine;
class AMDGPUSubtarget {
public:
enum Generation {
INVALID = 0,
R600 = 1,
R700 = 2,
EVERGREEN = 3,
NORTHERN_ISLANDS = 4,
SOUTHERN_ISLANDS = 5,
SEA_ISLANDS = 6,
VOLCANIC_ISLANDS = 7,
GFX9 = 8,
GFX10 = 9,
GFX11 = 10,
GFX12 = 11,
};
private:
Triple TargetTriple;
protected:
bool GCN3Encoding = false;
bool Has16BitInsts = false;
bool HasTrue16BitInsts = false;
bool HasFP8ConversionScaleInsts = false;
bool HasBF8ConversionScaleInsts = false;
bool HasFP4ConversionScaleInsts = false;
bool HasFP6BF6ConversionScaleInsts = false;
bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
bool HasBF16ConversionInsts = false;
bool HasMadMixInsts = false;
bool HasMadMacF32Insts = false;
bool HasDsSrc2Insts = false;
bool HasSDWA = false;
bool HasVOP3PInsts = false;
bool HasMulI24 = true;
bool HasMulU24 = true;
bool HasSMulHi = false;
bool HasInv2PiInlineImm = false;
bool HasFminFmaxLegacy = true;
bool EnablePromoteAlloca = false;
bool HasTrigReducedRange = false;
bool FastFMAF32 = false;
unsigned EUsPerCU = 4;
unsigned MaxWavesPerEU = 10;
unsigned LocalMemorySize = 0;
unsigned AddressableLocalMemorySize = 0;
char WavefrontSizeLog2 = 0;
public:
AMDGPUSubtarget(Triple TT);
static const AMDGPUSubtarget &get(const MachineFunction &MF);
static const AMDGPUSubtarget &get(const TargetMachine &TM,
const Function &F);
/// \returns Default range flat work group size for a calling convention.
std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
/// \returns Subtarget's default pair of minimum/maximum flat work group sizes
/// for function \p F, or minimum/maximum flat work group sizes explicitly
/// requested using "amdgpu-flat-work-group-size" attribute attached to
/// function \p F.
///
/// \returns Subtarget's default values if explicitly requested values cannot
/// be converted to integer, or violate subtarget's specifications.
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
/// \returns Subtarget's default pair of minimum/maximum number of waves per
/// execution unit for function \p F, or minimum/maximum number of waves per
/// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
/// attached to function \p F.
///
/// \returns Subtarget's default values if explicitly requested values cannot
/// be converted to integer, violate subtarget's specifications, or are not
/// compatible with minimum/maximum number of waves limited by flat work group
/// size, register usage, and/or lds usage.
std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const {
// Default/requested minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
return getWavesPerEU(F, FlatWorkGroupSizes);
}
/// Overload which uses the specified values for the flat work group sizes,
/// rather than querying the function itself. \p FlatWorkGroupSizes Should
/// correspond to the function's value for getFlatWorkGroupSizes.
std::pair<unsigned, unsigned>
getWavesPerEU(const Function &F,
std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
std::pair<unsigned, unsigned> getEffectiveWavesPerEU(
std::pair<unsigned, unsigned> WavesPerEU,
std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
/// Return the amount of LDS that can be used that will not restrict the
/// occupancy lower than WaveCount.
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
const Function &) const;
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p F and each
/// workgroup running the function requires \p LDSBytes bytes of LDS space.
/// This notably depends on the range of allowed flat group sizes for the
/// function and hardware characteristics.
std::pair<unsigned, unsigned>
getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p MF. This notably
/// depends on the range of allowed flat group sizes for the function, the
/// amount of per-workgroup LDS space required by the function, and hardware
/// characteristics.
std::pair<unsigned, unsigned>
getOccupancyWithWorkGroupSizes(const MachineFunction &MF) const;
bool isAmdHsaOS() const {
return TargetTriple.getOS() == Triple::AMDHSA;
}
bool isAmdPalOS() const {
return TargetTriple.getOS() == Triple::AMDPAL;
}
bool isMesa3DOS() const {
return TargetTriple.getOS() == Triple::Mesa3D;
}
bool isMesaKernel(const Function &F) const;
bool isAmdHsaOrMesa(const Function &F) const {
return isAmdHsaOS() || isMesaKernel(F);
}
bool isGCN() const { return TargetTriple.isAMDGCN(); }
bool isGCN3Encoding() const {
return GCN3Encoding;
}
bool has16BitInsts() const {
return Has16BitInsts;
}
/// Return true if the subtarget supports True16 instructions.
bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
/// Return true if real (non-fake) variants of True16 instructions using
/// 16-bit registers should be code-generated. Fake True16 instructions are
/// identical to non-fake ones except that they take 32-bit registers as
/// operands and always use their low halves.
// TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
// supported and the support for fake True16 instructions is removed.
bool useRealTrue16Insts() const;
bool hasBF16ConversionInsts() const {
return HasBF16ConversionInsts;
}
bool hasMadMixInsts() const {
return HasMadMixInsts;
}
bool hasFP8ConversionScaleInsts() const { return HasFP8ConversionScaleInsts; }
bool hasBF8ConversionScaleInsts() const { return HasBF8ConversionScaleInsts; }
bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; }
bool hasFP6BF6ConversionScaleInsts() const { return HasFP6BF6ConversionScaleInsts; }
bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; }
bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
bool hasF32ToF16BF16ConversionSRInsts() const {
return HasF32ToF16BF16ConversionSRInsts;
}
bool hasMadMacF32Insts() const {
return HasMadMacF32Insts || !isGCN();
}
bool hasDsSrc2Insts() const {
return HasDsSrc2Insts;
}
bool hasSDWA() const {
return HasSDWA;
}
bool hasVOP3PInsts() const {
return HasVOP3PInsts;
}
bool hasMulI24() const {
return HasMulI24;
}
bool hasMulU24() const {
return HasMulU24;
}
bool hasSMulHi() const {
return HasSMulHi;
}
bool hasInv2PiInlineImm() const {
return HasInv2PiInlineImm;
}
bool hasFminFmaxLegacy() const {
return HasFminFmaxLegacy;
}
bool hasTrigReducedRange() const {
return HasTrigReducedRange;
}
bool hasFastFMAF32() const {
return FastFMAF32;
}
bool isPromoteAllocaEnabled() const {
return EnablePromoteAlloca;
}
unsigned getWavefrontSize() const {
return 1 << WavefrontSizeLog2;
}
unsigned getWavefrontSizeLog2() const {
return WavefrontSizeLog2;
}
/// Return the maximum number of bytes of LDS available for all workgroups
/// running on the same WGP or CU.
/// For GFX10-GFX12 in WGP mode this is 128k even though each workgroup is
/// limited to 64k.
unsigned getLocalMemorySize() const {
return LocalMemorySize;
}
/// Return the maximum number of bytes of LDS that can be allocated to a
/// single workgroup.
/// For GFX10-GFX12 in WGP mode this is limited to 64k even though the WGP has
/// 128k in total.
unsigned getAddressableLocalMemorySize() const {
return AddressableLocalMemorySize;
}
/// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
/// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
/// CU mode into account.
unsigned getEUsPerCU() const { return EUsPerCU; }
Align getAlignmentForImplicitArgPtr() const {
return isAmdHsaOS() ? Align(8) : Align(4);
}
/// Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset() const {
switch (TargetTriple.getOS()) {
case Triple::AMDHSA:
case Triple::AMDPAL:
case Triple::Mesa3D:
return 0;
case Triple::UnknownOS:
default:
// For legacy reasons unknown/other is treated as a different version of
// mesa.
return 36;
}
llvm_unreachable("invalid triple OS");
}
/// \returns Maximum number of work groups per compute unit supported by the
/// subtarget and limited by given \p FlatWorkGroupSize.
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
/// \returns Minimum flat work group size supported by the subtarget.
virtual unsigned getMinFlatWorkGroupSize() const = 0;
/// \returns Maximum flat work group size supported by the subtarget.
virtual unsigned getMaxFlatWorkGroupSize() const = 0;
/// \returns Number of waves per execution unit required to support the given
/// \p FlatWorkGroupSize.
virtual unsigned
getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
/// \returns Minimum number of waves per execution unit supported by the
/// subtarget.
virtual unsigned getMinWavesPerEU() const = 0;
/// \returns Maximum number of waves per execution unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
/// Return the maximum workitem ID value in the function, for the given (0, 1,
/// 2) dimension.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
/// Return the number of work groups for the function.
SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) const;
/// Return true if only a single workitem can be active in a wave.
bool isSingleLaneExecution(const Function &Kernel) const;
/// Creates value range metadata on an workitemid.* intrinsic call or load.
bool makeLIDRangeMetadata(Instruction *I) const;
/// \returns Number of bytes of arguments that are passed to a shader or
/// kernel in addition to the explicit ones declared for the function.
unsigned getImplicitArgNumBytes(const Function &F) const;
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
/// \returns Corresponding DWARF register number mapping flavour for the
/// \p WavefrontSize.
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
virtual ~AMDGPUSubtarget() = default;
};
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H