llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
Gheorghe-Teodor Bercea 3df36a2b18
[AMDGPU] Enable vectorization of i8 values. (#134934)
This patch adjusts the cost model to account for the ability of the
AMDGPU optimizer to group together i8 values into i32 values.

Co-authored-by: Erich Keane <ekeane@nvidia.com>
2025-06-26 19:15:31 -04:00

310 lines
12 KiB
C++

//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file a TargetTransformInfoImplBase conforming object specific to the
/// AMDGPU target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>
namespace llvm {
class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;
class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
using TTI = TargetTransformInfo;
friend BaseT;
Triple TargetTriple;
const TargetSubtargetInfo *ST;
const TargetLoweringBase *TLI;
const TargetSubtargetInfo *getST() const { return ST; }
const TargetLoweringBase *getTLI() const { return TLI; }
public:
explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) const override;
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP) const override;
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
};
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
using BaseT = BasicTTIImplBase<GCNTTIImpl>;
using TTI = TargetTransformInfo;
friend BaseT;
const GCNSubtarget *ST;
const SITargetLowering *TLI;
AMDGPUTTIImpl CommonTTI;
bool IsGraphics;
bool HasFP32Denormals;
bool HasFP64FP16Denormals;
static constexpr bool InlinerVectorBonusPercent = 0;
static const FeatureBitset InlineFeatureIgnoreList;
const GCNSubtarget *getST() const { return ST; }
const SITargetLowering *getTLI() const { return TLI; }
static inline int getFullRateInstrCost() {
return TargetTransformInfo::TCC_Basic;
}
static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
return CostKind == TTI::TCK_CodeSize ? 2
: 2 * TargetTransformInfo::TCC_Basic;
}
// TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
// should be 2 or 4.
static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
return CostKind == TTI::TCK_CodeSize ? 2
: 4 * TargetTransformInfo::TCC_Basic;
}
// On some parts, normal fp64 operations are half rate, and others
// quarter. This also applies to some integer operations.
int get64BitInstrCost(TTI::TargetCostKind CostKind) const;
std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;
public:
explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
bool hasBranchDivergence(const Function *F = nullptr) const override;
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) const override;
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP) const override;
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
return TTI::PSK_FastHardware;
}
unsigned getNumberOfRegisters(unsigned RCID) const override;
TypeSize
getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override;
unsigned getMinVectorRegisterBitWidth() const override;
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override;
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const override;
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const override;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const override;
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const override;
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override;
Type *getMemcpyLoopLoweringType(
LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicElementSize) const override;
void getMemcpyLoopResidualLoweringType(
SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
Align SrcAlign, Align DestAlign,
std::optional<uint32_t> AtomicCpySize) const override;
unsigned getMaxInterleaveFactor(ElementCount VF) const override;
bool getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const override;
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
ArrayRef<const Value *> Args = {},
const Instruction *CxtI = nullptr) const override;
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
bool isInlineAsmSourceOfDivergence(const CallInst *CI,
ArrayRef<unsigned> Indices = {}) const;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
TTI::TargetCostKind CostKind,
unsigned Index, const Value *Op0,
const Value *Op1) const override;
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
bool isSourceOfDivergence(const Value *V) const override;
bool isAlwaysUniform(const Value *V) const override;
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const override {
// Address space casts must cast between different address spaces.
if (FromAS == ToAS)
return false;
// Casts between any aliasing address spaces are valid.
return AMDGPU::addrspacesMayAlias(FromAS, ToAS);
}
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const override {
return AMDGPU::addrspacesMayAlias(AS0, AS1);
}
unsigned getFlatAddressSpace() const override {
// Don't bother running InferAddressSpaces pass on graphics shaders which
// don't use flat addressing.
if (IsGraphics)
return -1;
return AMDGPUAS::FLAT_ADDRESS;
}
bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const override;
bool
canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const override {
return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
AS != AMDGPUAS::PRIVATE_ADDRESS;
}
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const override;
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
const Value *Op1, InstCombiner &IC) const;
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
unsigned LaneAgIdx) const;
std::optional<Instruction *>
instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;
Value *simplifyAMDGCNLaneIntrinsicDemanded(InstCombiner &IC,
IntrinsicInst &II,
const APInt &DemandedElts,
APInt &UndefElts) const;
Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
IntrinsicInst &II) const;
std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
APInt &UndefElts2, APInt &UndefElts3,
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) const override;
InstructionCost getVectorSplitCost() const { return 0; }
InstructionCost
getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
VectorType *SubTp, ArrayRef<const Value *> Args = {},
const Instruction *CxtI = nullptr) const override;
bool isProfitableToSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const override;
int getInliningLastCallToStaticBonus() const override;
unsigned getInliningThresholdMultiplier() const override { return 11; }
unsigned adjustInliningThreshold(const CallBase *CB) const override;
unsigned getCallerAllocaCost(const CallBase *CB,
const AllocaInst *AI) const override;
int getInlinerVectorBonusPercent() const override {
return InlinerVectorBonusPercent;
}
InstructionCost
getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
std::optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind) const override;
InstructionCost
getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) const override;
InstructionCost
getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
TTI::TargetCostKind CostKind) const override;
/// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
unsigned getCacheLineSize() const override { return 128; }
/// How much before a load we should place the prefetch instruction.
/// This is currently measured in number of IR instructions.
unsigned getPrefetchDistance() const override;
/// \return if target want to issue a prefetch in address space \p AS.
bool shouldPrefetchAddressSpace(unsigned AS) const override;
void collectKernelLaunchBounds(
const Function &F,
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
enum class KnownIEEEMode { Unknown, On, Off };
/// Return KnownIEEEMode::On if we know if the use context can assume
/// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
/// "amdgpu-ieee"="false".
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;
/// Account for loads of i8 vector types to have reduced cost. For
/// example the cost of load 4 i8s values is one is the cost of loading
/// a single i32 value.
InstructionCost getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
const Instruction *I = nullptr) const override;
/// When counting parts on AMD GPUs, account for i8s being grouped
/// together under a single i32 value. Otherwise fall back to base
/// implementation.
unsigned getNumberOfParts(Type *Tp) const override;
};
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H