First pass where we calculate the cost of the memory operation, as well as the shuffles required. Interleaving by a factor of two should be relatively cheap, as many ISAs have dedicated instructions to perform the (de)interleaving. Several of these permutations can be combined for an interleave stride of 4, and this is the highest stride we allow. I've costed larger vectors, and more lanes, as more expensive because not only is more work needed, but the risk of codegen going 'wrong' also rises dramatically. I also filled in a bit of cost modelling for vector stores. It appears the main vector plan to avoid is an interleave factor of 4 with v16i8. I've used libyuv and ncnn for benchmarking, using V8 on AArch64, and observe a geomean improvement of ~3%, with some kernels improving 40-60%. I know there is still significant performance being left on the table, so this will need more development along with the rest of the cost model.
113 lines
4.3 KiB
C++
113 lines
4.3 KiB
C++
//==- WebAssemblyTargetTransformInfo.h - WebAssembly-specific TTI -*- C++ -*-=//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
///
|
|
/// \file
|
|
/// This file defines a TargetTransformInfoImplBase conforming object specific
|
|
/// to the WebAssembly target machine.
|
|
///
|
|
/// It uses the target's detailed information to provide more precise answers to
|
|
/// certain TTI queries, while letting the target independent and default TTI
|
|
/// implementations handle the rest.
|
|
///
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETTRANSFORMINFO_H
|
|
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETTRANSFORMINFO_H
|
|
|
|
#include "WebAssemblyTargetMachine.h"
|
|
#include "llvm/CodeGen/BasicTTIImpl.h"
|
|
#include <algorithm>
|
|
|
|
namespace llvm {
|
|
|
|
/// WebAssembly-specific TTI implementation, layered on top of
/// BasicTTIImplBase through CRTP. Apart from the constructor, the two private
/// accessors and enableInterleavedAccessVectorization(), every hook is only
/// declared here and defined out of line.
class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
  using BaseT = BasicTTIImplBase<WebAssemblyTTIImpl>;
  using TTI = TargetTransformInfo;
  friend BaseT;

  // Keep ST declared before TLI: the constructor initialises TLI from ST.
  const WebAssemblySubtarget *ST;
  const WebAssemblyTargetLowering *TLI;

  // Accessors for the cached subtarget and target lowering; private, with
  // BaseT granted access through the friend declaration above.
  const WebAssemblySubtarget *getST() const { return ST; }
  const WebAssemblyTargetLowering *getTLI() const { return TLI; }

public:
  /// Caches the subtarget that \p TM reports for \p F, together with that
  /// subtarget's target lowering, and forwards \p TM and the data layout of
  /// \p F to the base class.
  WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)),
        TLI(ST->getTargetLowering()) {}

  /// \name Scalar TTI Implementations
  /// @{

  // TODO: Implement more Scalar TTI for WebAssembly

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  /// Always answers true for this target.
  bool enableInterleavedAccessVectorization() const override { return true; }

  unsigned getNumberOfRegisters(unsigned ClassID) const override;

  TypeSize getRegisterBitWidth(TTI::RegisterKind K) const override;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

  InstructionCost
  getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                   TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
                   const Instruction *I = nullptr) const override;

  TTI::MemCmpExpansionOptions
  enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override;

  InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond, bool UseMaskForGaps) const override;

  // Re-expose the base-class overloads, which the override below would
  // otherwise hide.
  using BaseT::getVectorInstrCost;

  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, const Value *Op0,
                                     const Value *Op1) const override;

  InstructionCost getPartialReductionCost(
      unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
      ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
      TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
      TTI::TargetCostKind CostKind) const override;

  TTI::ReductionShuffle
  getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const override;

  bool supportsTailCalls() const override;

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const override;

  /// @}
};
|
|
|
|
} // end namespace llvm
|
|
|
|
#endif
|