llvm-project/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp

//===- GCNVOPDUtils.cpp - GCN VOPD Utils  ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains the AMDGPU DAG scheduling
/// mutation to pair VOPD instructions back to back. It also contains
/// subroutines useful in the creation of VOPD instructions.
//
//===----------------------------------------------------------------------===//

#include "GCNVOPDUtils.h"
#include "AMDGPUSubtarget.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MacroFusion.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/MC/MCInst.h"

using namespace llvm;

#define DEBUG_TYPE "gcn-vopd-utils"

// Check if MI is a VOP3P instruction with operands that satisfy the constraints
// for mapping it to a VOP2/VOPD opcode: no modifiers, no clamp, src1 and src2
// are registers (src0 can be register or literal), and src2 is same as dst.
static bool canMapVOP3PToVOPD(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_DOT2_F32_F16 && Opc != AMDGPU::V_DOT2_F32_BF16)
    return false;
  // src0 can be register or literal
  int16_t Src0ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if (MI.getOperand(Src0ModsIdx).getImm() != SISrcMods::OP_SEL_1)
    return false;
  int16_t Src1ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  if (MI.getOperand(Src1ModsIdx).getImm() != SISrcMods::OP_SEL_1)
    return false;
  int16_t Src1Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (!MI.getOperand(Src1Idx).isReg())
    return false;
  int16_t Src2ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers);
  if (MI.getOperand(Src2ModsIdx).getImm() != SISrcMods::OP_SEL_1)
    return false;
  int16_t Src2Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (!MI.getOperand(Src2Idx).isReg())
    return false;
  int16_t ClampIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::clamp);
  if (MI.getOperand(ClampIdx).getImm() != 0)
    return false;
  int16_t VdstIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
  return MI.getOperand(VdstIdx).getReg() == MI.getOperand(Src2Idx).getReg();
}

/// Check whether \p MIX and \p MIY satisfy the operand constraints verified
/// here for being combined into one VOPD instruction, with \p MIX taking the
/// X component and \p MIY the Y component.
///
/// \param TII     Instruction info used for operand and encoding queries.
/// \param MIX     Candidate for the X component.
/// \param MIY     Candidate for the Y component.
/// \param IsVOPD3 Check against the VOPD3 form instead of the plain VOPD form.
/// \returns true if every check below passes, false as soon as one fails.
bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
                                   const MachineInstr &MIX,
                                   const MachineInstr &MIY, bool IsVOPD3) {
  namespace VOPD = AMDGPU::VOPD;

  const MachineFunction *MF = MIX.getMF();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  if (IsVOPD3 && !ST.hasVOPD3())
    return false;
  // In the non-VOPD3 form a VOP3-encoded instruction is only acceptable when
  // it can be mapped back to a VOP2/VOPD opcode.
  if (!IsVOPD3 && ((TII.isVOP3(MIX) && !canMapVOP3PToVOPD(MIX)) ||
                   (TII.isVOP3(MIY) && !canMapVOP3PToVOPD(MIY))))
    return false;
  if (TII.isDPP(MIX) || TII.isDPP(MIY))
    return false;

  // GCNSubtarget::getRegisterInfo() already yields a SIRegisterInfo pointer,
  // so no cast is needed. (A dyn_cast here would advertise a possible null
  // result that is then dereferenced unchecked below.)
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  // Literals also count against scalar bus limit
  SmallVector<const MachineOperand *> UniqueLiterals;
  // Record Op as a literal unless an identical operand was already recorded.
  auto addLiteral = [&](const MachineOperand &Op) {
    for (auto &Literal : UniqueLiterals) {
      if (Literal->isIdenticalTo(Op))
        return;
    }
    UniqueLiterals.push_back(&Op);
  };
  SmallVector<Register> UniqueScalarRegs;

  // MIX must not modify any registers used by MIY.
  for (const auto &Use : MIY.uses())
    if (Use.isReg() && MIX.modifiesRegister(Use.getReg(), TRI))
      return false;

  // Callback for hasInvalidOperand(): yields the register of the requested
  // operand if it is a vector register, otherwise an invalid Register.
  auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) {
    const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? MIX : MIY;
    const MachineOperand &Operand = MI.getOperand(OperandIdx);
    if (Operand.isReg() && TRI->isVectorRegister(MRI, Operand.getReg()))
      return Operand.getReg();
    return Register();
  };

  auto InstInfo = AMDGPU::getVOPDInstInfo(MIX.getDesc(), MIY.getDesc());

  for (auto CompIdx : VOPD::COMPONENTS) {
    const MachineInstr &MI = (CompIdx == VOPD::X) ? MIX : MIY;

    // src0: a non-vector register counts as a scalar register, a
    // non-inline constant counts as a literal (not allowed for VOPD3).
    const MachineOperand &Src0 = *TII.getNamedOperand(MI, AMDGPU::OpName::src0);
    if (Src0.isReg()) {
      if (!TRI->isVectorRegister(MRI, Src0.getReg())) {
        if (!is_contained(UniqueScalarRegs, Src0.getReg()))
          UniqueScalarRegs.push_back(Src0.getReg());
      }
    } else if (!TII.isInlineConstant(Src0)) {
      if (IsVOPD3)
        return false;
      addLiteral(Src0);
    }

    if (InstInfo[CompIdx].hasMandatoryLiteral()) {
      if (IsVOPD3)
        return false;

      auto CompOprIdx = InstInfo[CompIdx].getMandatoryLiteralCompOperandIndex();
      addLiteral(MI.getOperand(CompOprIdx));
    }
    // NOTE(review): VCC_LO is appended without an is_contained() check, so it
    // is counted once per component that implicitly reads VCC (and again if
    // src0 already recorded it). That only makes the limit check below more
    // conservative; confirm whether the double counting is intended.
    if (MI.getDesc().hasImplicitUseOfPhysReg(AMDGPU::VCC))
      UniqueScalarRegs.push_back(AMDGPU::VCC_LO);

    if (IsVOPD3) {
      for (auto OpName : {AMDGPU::OpName::src1, AMDGPU::OpName::src2}) {
        const MachineOperand *Src = TII.getNamedOperand(MI, OpName);
        if (!Src)
          continue;
        if (OpName == AMDGPU::OpName::src2) {
          // src2 of a bitop3 instruction is validated after the loop.
          if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::bitop3))
            continue;
          if (MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
            // NOTE(review): assumes src2 is a register here (getReg() is
            // called without an isReg() check) and, like VCC_LO above, it is
            // added without deduplication.
            UniqueScalarRegs.push_back(Src->getReg());
            continue;
          }
        }
        // Every other src1/src2 must be a VGPR.
        if (!Src->isReg() || !TRI->isVGPR(MRI, Src->getReg()))
          return false;
      }

      for (auto OpName : {AMDGPU::OpName::clamp, AMDGPU::OpName::omod,
                          AMDGPU::OpName::op_sel}) {
        if (TII.hasModifiersSet(MI, OpName))
          return false;
      }

      // Neg is allowed, other modifiers are not. NB: even though sext has the
      // same value as neg, there are no combinable instructions with sext.
      for (auto OpName :
           {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
            AMDGPU::OpName::src2_modifiers}) {
        const MachineOperand *Mods = TII.getNamedOperand(MI, OpName);
        if (Mods && (Mods->getImm() & ~SISrcMods::NEG))
          return false;
      }
    }
  }

  // At most one distinct literal, and at most two scalar values in total.
  if (UniqueLiterals.size() > 1)
    return false;
  if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2)
    return false;

  // On GFX1170+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2
  // source-cache.
  bool SkipSrc = (ST.hasGFX11_7Insts() || ST.hasGFX12Insts()) &&
                 MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
                 MIY.getOpcode() == AMDGPU::V_MOV_B32_e32;
  bool AllowSameVGPR = ST.hasGFX1250Insts();

  if (InstInfo.hasInvalidOperand(getVRegIdx, *TRI, SkipSrc, AllowSameVGPR,
                                 IsVOPD3))
    return false;

  if (IsVOPD3) {
    // BITOP3 can be converted to DUAL_BITOP2 only if src2 is zero.
    // MIX check is only relevant to scheduling?
    auto hasNonZeroBitop3Src2 = [&TII](const MachineInstr &MI) {
      if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::bitop3))
        return false;
      const MachineOperand &Src2 =
          *TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      return !Src2.isImm() || Src2.getImm() != 0;
    };
    if (hasNonZeroBitop3Src2(MIX) || hasNonZeroBitop3Src2(MIY))
      return false;
  }

  LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << MIX
                    << "\n\tY: " << MIY << "\n");
  return true;
}

/// Check if the instr pair, FirstMI and SecondMI, should be scheduled
/// together. Given SecondMI, when FirstMI is unspecified, then check if
/// SecondMI may be part of a fused pair at all.
static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII,
                                       const TargetSubtargetInfo &TSI,
                                       const MachineInstr *FirstMI,
                                       const MachineInstr &SecondMI) {
  const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII);
  const GCNSubtarget &ST = STII.getSubtarget();
  unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(ST);
  unsigned Opc2 = SecondMI.getOpcode();

  const auto checkVOPD = [&](bool VOPD3) -> bool {
    auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2, EncodingFamily, VOPD3);

    // One instruction case
    if (!FirstMI)
      return SecondCanBeVOPD.Y || SecondCanBeVOPD.X;

    unsigned Opc = FirstMI->getOpcode();
    auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc, EncodingFamily, VOPD3);

    if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) ||
          (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)))
      return false;

#ifdef EXPENSIVE_CHECKS
    assert([&]() -> bool {
      for (auto MII = MachineBasicBlock::const_iterator(FirstMI);
           MII != FirstMI->getParent()->instr_end(); ++MII) {
        if (&*MII == &SecondMI)
          return true;
      }
      return false;
    }() && "Expected FirstMI to precede SecondMI");
#endif

    return checkVOPDRegConstraints(STII, *FirstMI, SecondMI, VOPD3);
  };

  return checkVOPD(false) || (ST.hasVOPD3() && checkVOPD(true));
}

namespace {
/// Adapts design from MacroFusion
/// Puts valid candidate instructions back-to-back so they can easily
/// be turned into VOPD instructions
/// Greedily pairs instruction candidates. O(n^2) algorithm.
struct VOPDPairingMutation : ScheduleDAGMutation {
  MacroFusionPredTy shouldScheduleAdjacent; // NOLINT: function pointer

  VOPDPairingMutation(
      MacroFusionPredTy shouldScheduleAdjacent) // NOLINT: function pointer
      : shouldScheduleAdjacent(shouldScheduleAdjacent) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    const TargetInstrInfo &TII = *DAG->TII;
    const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
    if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) {
      LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n");
      return;
    }

    std::vector<SUnit>::iterator ISUI, JSUI;
    for (ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) {
      const MachineInstr *IMI = ISUI->getInstr();
      if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI))
        continue;
      if (!hasLessThanNumFused(*ISUI, 2))
        continue;

      for (JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) {
        if (JSUI->isBoundaryNode())
          continue;
        const MachineInstr *JMI = JSUI->getInstr();
        if (!hasLessThanNumFused(*JSUI, 2) ||
            !shouldScheduleAdjacent(TII, ST, IMI, *JMI))
          continue;
        if (fuseInstructionPair(*DAG, *ISUI, *JSUI))
          break;
      }
    }
    LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n");
  }
};
} // namespace

/// Create the DAG mutation that pairs VOPD candidates, using
/// shouldScheduleVOPDAdjacent as its fusion predicate.
std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() {
  std::unique_ptr<ScheduleDAGMutation> Mutation =
      std::make_unique<VOPDPairingMutation>(shouldScheduleVOPDAdjacent);
  return Mutation;
}