
This PR adds more realistic cost estimates for these reduction intrinsics - `llvm.vector.reduce.umax` - `llvm.vector.reduce.umin` - `llvm.vector.reduce.smax` - `llvm.vector.reduce.smin` - `llvm.vector.reduce.fadd` - `llvm.vector.reduce.fmul` - `llvm.vector.reduce.fmax` - `llvm.vector.reduce.fmin` - `llvm.vector.reduce.fmaximum` - `llvm.vector.reduce.fminimum` - `llvm.vector.reduce.mul ` The pre-existing cost estimates for `llvm.vector.reduce.add` are moved to `getArithmeticReductionCosts` to reduce complexity in `getVectorIntrinsicInstrCost` and enable other passes, like the SLP vectorizer, to benefit from these updated calculations. These are not expected to provide noticable performance improvements and are rather provided for the sake of completeness and correctness. This PR is in draft mode pending benchmark confirmation of this. This also provides and/or updates cost tests for all of these intrinsics. This PR was co-authored by me and @JonPsson1 .
150 lines
6.7 KiB
C++
150 lines
6.7 KiB
C++
//===-- SystemZTargetTransformInfo.h - SystemZ-specific TTI ---------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETTRANSFORMINFO_H
|
|
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETTRANSFORMINFO_H
|
|
|
|
#include "SystemZTargetMachine.h"
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
#include "llvm/CodeGen/BasicTTIImpl.h"
|
|
|
|
namespace llvm {
|
|
|
|
class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
|
|
typedef BasicTTIImplBase<SystemZTTIImpl> BaseT;
|
|
typedef TargetTransformInfo TTI;
|
|
friend BaseT;
|
|
|
|
const SystemZSubtarget *ST;
|
|
const SystemZTargetLowering *TLI;
|
|
|
|
const SystemZSubtarget *getST() const { return ST; }
|
|
const SystemZTargetLowering *getTLI() const { return TLI; }
|
|
|
|
unsigned const LIBCALL_COST = 30;
|
|
|
|
bool isInt128InVR(Type *Ty) { return Ty->isIntegerTy(128) && ST->hasVector(); }
|
|
|
|
public:
|
|
explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
|
|
: BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)),
|
|
TLI(ST->getTargetLowering()) {}
|
|
|
|
/// \name Scalar TTI Implementations
|
|
/// @{
|
|
|
|
unsigned adjustInliningThreshold(const CallBase *CB) const;
|
|
|
|
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
|
|
TTI::TargetCostKind CostKind);
|
|
|
|
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
|
|
const APInt &Imm, Type *Ty,
|
|
TTI::TargetCostKind CostKind,
|
|
Instruction *Inst = nullptr);
|
|
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
|
|
const APInt &Imm, Type *Ty,
|
|
TTI::TargetCostKind CostKind);
|
|
|
|
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
|
|
|
|
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
|
|
TTI::UnrollingPreferences &UP,
|
|
OptimizationRemarkEmitter *ORE);
|
|
|
|
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
|
|
TTI::PeelingPreferences &PP);
|
|
|
|
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
|
|
const TargetTransformInfo::LSRCost &C2);
|
|
/// @}
|
|
|
|
/// \name Vector TTI Implementations
|
|
/// @{
|
|
|
|
unsigned getNumberOfRegisters(unsigned ClassID) const;
|
|
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
|
|
|
|
unsigned getCacheLineSize() const override { return 256; }
|
|
unsigned getPrefetchDistance() const override { return 4500; }
|
|
unsigned getMinPrefetchStride(unsigned NumMemAccesses,
|
|
unsigned NumStridedMemAccesses,
|
|
unsigned NumPrefetches,
|
|
bool HasCall) const override;
|
|
bool enableWritePrefetching() const override { return true; }
|
|
|
|
bool hasDivRemOp(Type *DataType, bool IsSigned);
|
|
bool prefersVectorizedAddressing() { return false; }
|
|
bool LSRWithInstrQueries() { return true; }
|
|
InstructionCost getScalarizationOverhead(VectorType *Ty,
|
|
const APInt &DemandedElts,
|
|
bool Insert, bool Extract,
|
|
TTI::TargetCostKind CostKind,
|
|
ArrayRef<Value *> VL = {});
|
|
bool supportsEfficientVectorElementLoadStore() { return true; }
|
|
bool enableInterleavedAccessVectorization() { return true; }
|
|
|
|
InstructionCost getArithmeticInstrCost(
|
|
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
|
|
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
|
|
TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
|
|
ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr);
|
|
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
|
|
ArrayRef<int> Mask,
|
|
TTI::TargetCostKind CostKind, int Index,
|
|
VectorType *SubTp,
|
|
ArrayRef<const Value *> Args = {},
|
|
const Instruction *CxtI = nullptr);
|
|
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
|
|
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
|
|
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
|
|
const Instruction *I);
|
|
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
|
|
TTI::CastContextHint CCH,
|
|
TTI::TargetCostKind CostKind,
|
|
const Instruction *I = nullptr);
|
|
InstructionCost getCmpSelInstrCost(
|
|
unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
|
|
TTI::TargetCostKind CostKind,
|
|
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
|
|
TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
|
|
const Instruction *I = nullptr);
|
|
using BaseT::getVectorInstrCost;
|
|
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
|
|
TTI::TargetCostKind CostKind,
|
|
unsigned Index, Value *Op0, Value *Op1);
|
|
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
|
|
InstructionCost
|
|
getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
|
|
unsigned AddressSpace, TTI::TargetCostKind CostKind,
|
|
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
|
|
const Instruction *I = nullptr);
|
|
|
|
InstructionCost getInterleavedMemoryOpCost(
|
|
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
|
|
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
|
|
bool UseMaskForCond = false, bool UseMaskForGaps = false);
|
|
|
|
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
|
|
std::optional<FastMathFlags> FMF,
|
|
TTI::TargetCostKind CostKind);
|
|
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
|
|
FastMathFlags FMF,
|
|
TTI::TargetCostKind CostKind);
|
|
|
|
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
|
|
TTI::TargetCostKind CostKind);
|
|
|
|
bool shouldExpandReduction(const IntrinsicInst *II) const;
|
|
/// @}
|
|
};
|
|
|
|
} // end namespace llvm
|
|
|
|
#endif
|