llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
Sam Parker 7b3e77f8d9
[WebAssembly] Implement getInterleavedMemoryOpCost (#146864)
First pass where we calculate the cost of the memory operation, as well
as the shuffles required. Interleaving by a factor of two should be
relatively cheap, as many ISAs have dedicated instructions to perform
the (de)interleaving. Several of these permutations can be combined for
an interleave stride of 4 and this is the highest stride we allow.

I've costed larger vectors, and more lanes, as more expensive because
not only is more work is needed but the risk of codegen going 'wrong'
rises dramatically. I also filled in a bit of cost modelling for vector
stores.

It appears the main vector plan to avoid is an interleave factor of 4
with v16i8. I've used libyuv and ncnn for benchmarking, using V8 on
AArch64, and observe geomean improvement of ~3% with some kernels
improving 40-60%.

I know there is still significant performance being left on the table,
so this will need more development along with the rest of the cost
model.
2025-08-27 12:43:52 +01:00

113 lines
4.3 KiB
C++

//==- WebAssemblyTargetTransformInfo.h - WebAssembly-specific TTI -*- C++ -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file a TargetTransformInfoImplBase conforming object specific
/// to the WebAssembly target machine.
///
/// It uses the target's detailed information to provide more precise answers to
/// certain TTI queries, while letting the target independent and default TTI
/// implementations handle the rest.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETTRANSFORMINFO_H
#include "WebAssemblyTargetMachine.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include <algorithm>
namespace llvm {
class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
typedef BasicTTIImplBase<WebAssemblyTTIImpl> BaseT;
typedef TargetTransformInfo TTI;
friend BaseT;
const WebAssemblySubtarget *ST;
const WebAssemblyTargetLowering *TLI;
const WebAssemblySubtarget *getST() const { return ST; }
const WebAssemblyTargetLowering *getTLI() const { return TLI; }
public:
WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, const Function &F)
: BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
/// \name Scalar TTI Implementations
/// @{
// TODO: Implement more Scalar TTI for WebAssembly
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) const override;
/// @}
/// \name Vector TTI Implementations
/// @{
bool enableInterleavedAccessVectorization() const override { return true; }
unsigned getNumberOfRegisters(unsigned ClassID) const override;
TypeSize
getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override;
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
ArrayRef<const Value *> Args = {},
const Instruction *CxtI = nullptr) const override;
InstructionCost
getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;
TTI::MemCmpExpansionOptions
enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override;
InstructionCost getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
const Instruction *I = nullptr) const override;
InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) const override;
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, const Value *Op0,
const Value *Op1) const override;
InstructionCost getPartialReductionCost(
unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const override;
TTI::ReductionShuffle
getPreferredExpandedReductionShuffle(const IntrinsicInst *II) const override;
bool supportsTailCalls() const override;
bool isProfitableToSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
/// @}
};
} // end namespace llvm
#endif