llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp

//===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file defines the WebAssembly-specific TargetTransformInfo
/// implementation.
///
//===----------------------------------------------------------------------===//

#include "WebAssemblyTargetTransformInfo.h"

#include "llvm/CodeGen/CostTable.h"
using namespace llvm;

#define DEBUG_TYPE "wasmtti"

/// Report the level of population-count support for an integer of
/// \p TyWidth bits. The width must be a power of two; every such width is
/// reported as having fast hardware support.
TargetTransformInfo::PopcntSupportKind
WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  return TTI::PSK_FastHardware;
}

/// Return the number of registers assumed for register class \p ClassID.
///
/// The generic answer is used as-is for every class except the vector class
/// (ID 1), where it is raised to at least 16 as a rough guess for SIMD.
unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  constexpr unsigned VectorClassID = 1;
  constexpr unsigned MinSIMDRegisters = 16;

  const unsigned BaseCount = BaseT::getNumberOfRegisters(ClassID);
  if (ClassID != VectorClassID)
    return BaseCount;

  return std::max(BaseCount, MinSIMDRegisters);
}

/// Return the register width, in bits, for the register kind \p K.
///
/// Scalars are 64 bits wide. Fixed-width vectors are 128 bits when the
/// subtarget has SIMD128 and 64 bits otherwise. Scalable vectors report a
/// scalable size of zero.
TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
    TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TTI::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TTI::RGK_FixedWidthVector: {
    const bool HasSIMD = getST()->hasSIMD128();
    return TypeSize::getFixed(HasSIMD ? 128 : 64);
  }
  case TTI::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

/// Estimate the cost of the arithmetic instruction \p Opcode on type \p Ty.
///
/// The generic cost is used for everything except fixed-width vector shifts
/// whose shift amount (\p Op2Info) is not uniform. Those are modelled as a
/// per-element extract / scalar shift / insert sequence.
///
/// \returns the generic cost, or the scalarized estimate for non-uniform
/// fixed-width vector shifts.
InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  InstructionCost Cost =
      BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
          Opcode, Ty, CostKind, Op1Info, Op2Info);

  // The adjustment below needs a concrete element count, so only fixed-width
  // vectors are considered. Narrowing with dyn_cast (rather than an
  // unconditional cast<FixedVectorType> on any VectorType) keeps the generic
  // cost for every other type instead of asserting.
  auto *VTy = dyn_cast<FixedVectorType>(Ty);
  if (!VTy)
    return Cost;

  switch (Opcode) {
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::Shl:
    // SIMD128's shifts currently only accept a scalar shift count. For each
    // element, we'll need to extract, op, insert. The following is a rough
    // approximation.
    if (!Op2Info.isUniform())
      Cost = VTy->getNumElements() *
             (TargetTransformInfo::TCC_Basic +
              getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind) +
              TargetTransformInfo::TCC_Basic);
    break;
  }
  return Cost;
}

/// Estimate the cost of the cast \p Opcode from \p Src to \p Dst.
///
/// The generic cost is returned when either type does not map to a simple
/// value type or when the subtarget lacks SIMD128. Otherwise:
///  - an integer extension whose only user is a multiply is costed as part of
///    an extmul_low (free, or 1 when an intermediate extend is required);
///  - extensions listed in the extend_low table use the table's cost;
///  - everything else uses the generic cost.
InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
    unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH,
    TTI::TargetCostKind CostKind, const Instruction *I) const {
  const int CastISD = TLI->InstructionOpcodeToISD(Opcode);
  const EVT SrcEVT = TLI->getValueType(DL, Src);
  const EVT DstEVT = TLI->getValueType(DL, Dst);

  // Nothing target-specific to say without simple types and SIMD128.
  if (!SrcEVT.isSimple() || !DstEVT.isSimple() || !ST->hasSIMD128())
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);

  const MVT DstVT = DstEVT.getSimpleVT();
  const MVT SrcVT = SrcEVT.getSimpleVT();

  if (I && I->hasOneUser()) {
    const auto *OnlyUser = cast<Instruction>(*I->user_begin());
    const int UserISD = TLI->InstructionOpcodeToISD(OnlyUser->getOpcode());
    const bool IsIntExtend =
        CastISD == ISD::ZERO_EXTEND || CastISD == ISD::SIGN_EXTEND;

    // extmul_low support
    if (UserISD == ISD::MUL && IsIntExtend) {
      // Doubling the element width folds into the multiply for free.
      const bool IsSingleStep =
          (SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) ||
          (SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) ||
          (SrcVT == MVT::v2i32 && DstVT == MVT::v2i64);
      if (IsSingleStep)
        return 0;

      // Quadrupling the element width needs one extra extlow to produce the
      // intermediate i16/i32 value.
      const bool IsDoubleStep =
          (SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) ||
          (SrcVT == MVT::v2i16 && DstVT == MVT::v2i64);
      if (IsDoubleStep)
        return 1;
    }
  }

  // extend_low
  static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
      {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1},
      {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1},
      {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
      {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
      {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1},
      {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1},
      {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2},
      {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2},
      {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2},
      {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
  };

  const auto *TblEntry =
      ConvertCostTableLookup(ConversionTbl, CastISD, DstVT, SrcVT);
  if (TblEntry)
    return TblEntry->Cost;

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

/// Build the options controlling inline expansion of memcmp.
///
/// Overlapping loads are allowed. Candidate load sizes are 8, 4, 2 and 1
/// bytes, preceded by 16 bytes when SIMD128 is available. The maximum number
/// of loads comes from the target lowering (dependent on \p OptSize), and the
/// per-block load count is set to that same maximum.
WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions
WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Opts;

  Opts.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Opts.NumLoadsPerBlock = Opts.MaxNumLoads;
  Opts.AllowOverlappingLoads = true;

  // Widest load first: a full 128-bit vector when SIMD is present.
  if (ST->hasSIMD128())
    Opts.LoadSizes.push_back(16);
  Opts.LoadSizes.append({8, 4, 2, 1});

  return Opts;
}

/// Estimate the cost of the load or store \p Opcode on type \p Ty.
///
/// With SIMD128 and a fixed-width vector type, loads of 32, 64 or 128 bits
/// and stores of 8, 16, 32, 64 or 128 bits cost 2. An invalid cost is
/// returned when type legalization reports an invalid cost. Every other case
/// uses the generic implementation.
InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
    unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
    const Instruction *I) const {
  // All paths without a target-specific answer defer to the generic cost.
  const auto GenericCost = [&]() -> InstructionCost {
    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
                                  CostKind);
  };

  if (!ST->hasSIMD128() || !isa<FixedVectorType>(Ty))
    return GenericCost();

  const EVT VT = TLI->getValueType(DL, Ty, true);
  // Type legalization can't handle structs
  if (VT == MVT::Other)
    return GenericCost();

  if (!getTypeLegalizationCost(Ty).first.isValid())
    return InstructionCost::getInvalid();

  const int MemISD = TLI->InstructionOpcodeToISD(Opcode);
  const unsigned Bits = VT.getSizeInBits();

  // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
  // can be lowered to load32_zero and load64_zero respectively.
  const bool IsCheapLoadWidth = Bits == 32 || Bits == 64 || Bits == 128;
  // Stores can additionally use store lane operations for 8 and 16 bits.
  const bool IsCheapStoreWidth = IsCheapLoadWidth || Bits == 8 || Bits == 16;

  // Assume SIMD memory operations are twice as expensive as scalar ones.
  if ((MemISD == ISD::LOAD && IsCheapLoadWidth) ||
      (MemISD == ISD::STORE && IsCheapStoreWidth))
    return 2;

  return GenericCost();
}

/// Estimate the cost of an interleaved load or store group.
///
/// \p Ty is the wide vector type covering the whole group and \p Factor is
/// the interleave factor (at least 2).
///
/// \returns
///  - an invalid cost without SIMD128, for non-fixed vectors, when the element
///    count is below 2 or not a multiple of \p Factor, or when the element
///    size is not 8, 16, 32 or 64 bits;
///  - the generic cost when masking is requested, when \p Factor exceeds 4,
///    or when the per-member sub-vector type has no shuffle table entry;
///  - otherwise the table's shuffle cost plus the cost of the memory accesses.
InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) const {
  assert(Factor >= 2 && "Invalid interleave factor");

  auto *VecTy = cast<VectorType>(Ty);
  if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) {
    return InstructionCost::getInvalid();
  }

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  constexpr unsigned MaxInterleaveFactor = 4;
  if (Factor <= MaxInterleaveFactor) {
    unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
    // Ensure the number of vector elements is greater than 1.
    if (MinElts < 2 || MinElts % Factor != 0)
      return InstructionCost::getInvalid();

    unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
    // Ensure the element type is legal.
    if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
      return InstructionCost::getInvalid();

    // The type of a single member of the interleave group.
    auto *SubVecTy =
        VectorType::get(VecTy->getElementType(),
                        VecTy->getElementCount().divideCoefficientBy(Factor));
    InstructionCost MemCost =
        getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);

    unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
    unsigned MaxVecSize = 128;
    // NOTE(review): the rounding term uses MaxVecSize while the divisor is
    // the sub-vector size; confirm this mix is the intended heuristic.
    unsigned NumAccesses =
        std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);

    // A stride of two is commonly supported via dedicated instructions, so it
    // should be relatively cheap for all element sizes. A stride of four is
    // more expensive as it will likely require more shuffles. Using two
    // simd128 inputs is considered more expensive and we mainly account for
    // shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable
    // arithmetic kernels.
    static const CostTblEntry ShuffleCostTbl[] = {
        // One reg.
        {2, MVT::v2i8, 1},  // interleave 2 x 2i8 into 4i8
        {2, MVT::v4i8, 1},  // interleave 2 x 4i8 into 8i8
        {2, MVT::v8i8, 1},  // interleave 2 x 8i8 into 16i8
        {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
        {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
        {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32

        // Two regs.
        {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
        {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
        {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32

        // One reg.
        {4, MVT::v2i8, 4},  // interleave 4 x 2i8 into 8i8
        {4, MVT::v4i8, 4},  // interleave 4 x 4i8 into 16i8
        {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16

        // Two regs.
        {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
        {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
        {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32

        // Four regs.
        {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
    };

    EVT ETy = TLI->getValueType(DL, SubVecTy);
    // getSimpleVT() is only valid on simple value types. A sub-vector that
    // maps to an extended EVT cannot match any table entry, so skip the
    // lookup and fall through to the generic cost.
    if (ETy.isSimple()) {
      if (const auto *Entry =
              CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
        return Entry->Cost + (NumAccesses * MemCost);
    }
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

/// Estimate the cost of the vector insert/extract \p Opcode on \p Val.
///
/// The generic cost is returned unchanged unless \p Index is -1u, in which
/// case a large penalty is added on top of it.
InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    const Value *Op0, const Value *Op1) const {
  const InstructionCost BaseCost = BasicTTIImplBase::getVectorInstrCost(
      Opcode, Val, CostKind, Index, Op0, Op1);

  if (Index != -1u)
    return BaseCost;

  // SIMD128's insert/extract currently only take constant indices, so an
  // Index of -1u is penalized heavily.
  return BaseCost + 25 * TargetTransformInfo::TCC_Expensive;
}

/// Estimate the cost of a partial reduction.
///
/// A valid cost is produced only for an add-of-multiply reduction with a
/// fixed \p VF, SIMD128 enabled, reciprocal-throughput costing, identical
/// input types and extend kinds, and an i32 accumulator. The supported shapes
/// are 8 x i16 and 16 x i8 inputs; every other request is invalid.
InstructionCost WebAssemblyTTIImpl::getPartialReductionCost(
    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
    ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
    TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
    TTI::TargetCostKind CostKind) const {
  const InstructionCost Invalid = InstructionCost::getInvalid();

  if (!VF.isFixed() || !ST->hasSIMD128() ||
      CostKind != TTI::TCK_RecipThroughput)
    return Invalid;

  // Possible options:
  // - i16x8.extadd_pairwise_i8x16_sx
  // - i32x4.extadd_pairwise_i16x8_sx
  // - i32x4.dot_i16x8_s
  // Only try to support dot, for now.
  const bool IsAddOfMul = Opcode == Instruction::Add && BinOp &&
                          *BinOp == Instruction::Mul;
  if (!IsAddOfMul)
    return Invalid;

  // Both multiplicands must share a type and be extended the same way.
  if (InputTypeA != InputTypeB || OpAExtend != OpBExtend)
    return Invalid;

  const EVT InputEVT = EVT::getEVT(InputTypeA);
  const EVT AccumEVT = EVT::getEVT(AccumType);

  // TODO: Add i64 accumulator.
  if (AccumEVT != MVT::i32)
    return Invalid;

  const InstructionCost Unit(TTI::TCC_Basic);
  const bool IsSigned = OpAExtend == TTI::PR_SignExtend;
  const unsigned NumElts = VF.getFixedValue();

  // Signed inputs can lower to dot; other extends cost twice as much.
  if (InputEVT == MVT::i16 && NumElts == 8) {
    if (IsSigned)
      return Unit;
    return Unit * 2;
  }

  // Double the size of the lowered sequence.
  if (InputEVT == MVT::i8 && NumElts == 16) {
    if (IsSigned)
      return Unit * 2;
    return Unit * 4;
  }

  return Invalid;
}

/// Choose the shuffle pattern used when expanding the reduction intrinsic
/// \p II: pairwise for vector_reduce_fadd, split-half for everything else.
TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle(
    const IntrinsicInst *II) const {
  const bool IsFAddReduce =
      II->getIntrinsicID() == Intrinsic::vector_reduce_fadd;
  if (IsFAddReduce)
    return TTI::ReductionShuffle::Pairwise;

  return TTI::ReductionShuffle::SplitHalf;
}

/// Fill in \p UP with the unrolling preferences for loop \p L.
///
/// \p UP is left untouched if the loop contains a direct call or invoke of a
/// function that is lowered to a real call. Otherwise partial, runtime and
/// upper-bound unrolling are enabled with a partial threshold of 30, and
/// unrolling is disabled when optimizing for size.
void WebAssemblyTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  // Scan the loop: don't unroll loops with calls. This is a standard approach
  // for most (all?) targets.
  for (BasicBlock *BB : L->blocks()) {
    for (Instruction &Inst : *BB) {
      if (!isa<CallInst>(Inst) && !isa<InvokeInst>(Inst))
        continue;

      // Only calls with a known callee are inspected here.
      const Function *Callee = cast<CallBase>(Inst).getCalledFunction();
      if (Callee && isLoweredToCall(Callee))
        return;
    }
  }

  // The chosen threshold is within the range of 'LoopMicroOpBufferSize' of
  // the various microarchitectures that use the BasicTTI implementation and
  // has been selected through heuristics across multiple cores and runtimes.
  UP.UpperBound = true;
  UP.Runtime = true;
  UP.Partial = true;
  UP.PartialThreshold = 30;

  // Avoid unrolling when optimizing for size.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;

  // Set number of instructions optimized when "back edge"
  // becomes "fall through" to default value of 2.
  UP.BEInsns = 2;
}

/// Report whether tail calls are supported, as indicated by the subtarget's
/// tail-call feature.
bool WebAssemblyTTIImpl::supportsTailCalls() const {
  const bool HasTailCall = getST()->hasTailCall();
  return HasTailCall;
}

/// Decide whether operands of \p I should be sunk next to it.
///
/// Only vector shifts are considered. When the shift amount is a non-constant
/// splat built as a zero-mask shuffle of an insertelement at index zero, the
/// uses of the insert and of the shuffle are appended to \p Ops (in that
/// order) and true is returned. In every other case \p Ops is untouched and
/// false is returned.
bool WebAssemblyTTIImpl::isProfitableToSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;

  const bool IsVectorShift = I->getType()->isVectorTy() && I->isShift();
  if (!IsVectorShift)
    return false;

  Value *ShiftAmt = I->getOperand(1);
  // We don't need to sink a constant splat.
  if (isa<Constant>(ShiftAmt))
    return false;

  const bool IsSplatOfScalar =
      match(ShiftAmt,
            m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
                      m_Value(), m_ZeroMask()));
  if (!IsSplatOfScalar)
    return false;

  // Sink insert
  Ops.push_back(&cast<Instruction>(ShiftAmt)->getOperandUse(0));
  // Sink shuffle
  Ops.push_back(&I->getOperandUse(1));
  return true;
}