For V_DOT2_F32_F16 and V_DOT2_F32_BF16, add their VOPDName and mark them with usesCustomInserter, which is used to add pre-RA register allocation hints so that dst and src2 are preferably assigned to the same physical register. When the hint is satisfied, canMapVOP3PToVOPD recognises the instruction as eligible for VOPD pairing by checking that it is VOP2-like: dst == src2, no source modifiers, no clamp, and src1 is a register. Mark both instructions as commutable so that a literal in src1 can be moved to src0, since VOPD only permits a literal in src0.
299 lines
11 KiB
C++
299 lines
11 KiB
C++
//===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file This file contains the AMDGPU DAG scheduling
|
|
/// mutation to pair VOPD instructions back to back. It also contains
|
|
/// subroutines useful in the creation of VOPD instructions.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "GCNVOPDUtils.h"
|
|
#include "AMDGPUSubtarget.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
|
#include "SIInstrInfo.h"
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
|
#include "llvm/CodeGen/MachineInstr.h"
|
|
#include "llvm/CodeGen/MachineOperand.h"
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
#include "llvm/CodeGen/MacroFusion.h"
|
|
#include "llvm/CodeGen/ScheduleDAG.h"
|
|
#include "llvm/CodeGen/ScheduleDAGMutation.h"
|
|
#include "llvm/CodeGen/TargetInstrInfo.h"
|
|
#include "llvm/MC/MCInst.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "gcn-vopd-utils"
|
|
|
|
// Check if MI is a VOP3P instruction with operands that satisfy the constraints
|
|
// for mapping it to a VOP2/VOPD opcode: no modifiers, no clamp, src1 and src2
|
|
// are registers (src0 can be register or literal), and src2 is same as dst.
|
|
static bool canMapVOP3PToVOPD(const MachineInstr &MI) {
|
|
unsigned Opc = MI.getOpcode();
|
|
if (Opc != AMDGPU::V_DOT2_F32_F16 && Opc != AMDGPU::V_DOT2_F32_BF16)
|
|
return false;
|
|
// src0 can be register or literal
|
|
int16_t Src0ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
|
|
if (MI.getOperand(Src0ModsIdx).getImm() != SISrcMods::OP_SEL_1)
|
|
return false;
|
|
int16_t Src1ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
|
|
if (MI.getOperand(Src1ModsIdx).getImm() != SISrcMods::OP_SEL_1)
|
|
return false;
|
|
int16_t Src1Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
|
|
if (!MI.getOperand(Src1Idx).isReg())
|
|
return false;
|
|
int16_t Src2ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers);
|
|
if (MI.getOperand(Src2ModsIdx).getImm() != SISrcMods::OP_SEL_1)
|
|
return false;
|
|
int16_t Src2Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
|
|
if (!MI.getOperand(Src2Idx).isReg())
|
|
return false;
|
|
int16_t ClampIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::clamp);
|
|
if (MI.getOperand(ClampIdx).getImm() != 0)
|
|
return false;
|
|
int16_t VdstIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
|
|
return MI.getOperand(VdstIdx).getReg() == MI.getOperand(Src2Idx).getReg();
|
|
}
|
|
|
|
bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
|
|
const MachineInstr &MIX,
|
|
const MachineInstr &MIY, bool IsVOPD3) {
|
|
namespace VOPD = AMDGPU::VOPD;
|
|
|
|
const MachineFunction *MF = MIX.getMF();
|
|
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
|
|
|
|
if (IsVOPD3 && !ST.hasVOPD3())
|
|
return false;
|
|
if (!IsVOPD3 && ((TII.isVOP3(MIX) && !canMapVOP3PToVOPD(MIX)) ||
|
|
(TII.isVOP3(MIY) && !canMapVOP3PToVOPD(MIY))))
|
|
return false;
|
|
if (TII.isDPP(MIX) || TII.isDPP(MIY))
|
|
return false;
|
|
|
|
const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo());
|
|
const MachineRegisterInfo &MRI = MF->getRegInfo();
|
|
// Literals also count against scalar bus limit
|
|
SmallVector<const MachineOperand *> UniqueLiterals;
|
|
auto addLiteral = [&](const MachineOperand &Op) {
|
|
for (auto &Literal : UniqueLiterals) {
|
|
if (Literal->isIdenticalTo(Op))
|
|
return;
|
|
}
|
|
UniqueLiterals.push_back(&Op);
|
|
};
|
|
SmallVector<Register> UniqueScalarRegs;
|
|
|
|
// MIX must not modify any registers used by MIY.
|
|
for (const auto &Use : MIY.uses())
|
|
if (Use.isReg() && MIX.modifiesRegister(Use.getReg(), TRI))
|
|
return false;
|
|
|
|
auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) {
|
|
const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? MIX : MIY;
|
|
const MachineOperand &Operand = MI.getOperand(OperandIdx);
|
|
if (Operand.isReg() && TRI->isVectorRegister(MRI, Operand.getReg()))
|
|
return Operand.getReg();
|
|
return Register();
|
|
};
|
|
|
|
auto InstInfo = AMDGPU::getVOPDInstInfo(MIX.getDesc(), MIY.getDesc());
|
|
|
|
for (auto CompIdx : VOPD::COMPONENTS) {
|
|
const MachineInstr &MI = (CompIdx == VOPD::X) ? MIX : MIY;
|
|
|
|
const MachineOperand &Src0 = *TII.getNamedOperand(MI, AMDGPU::OpName::src0);
|
|
if (Src0.isReg()) {
|
|
if (!TRI->isVectorRegister(MRI, Src0.getReg())) {
|
|
if (!is_contained(UniqueScalarRegs, Src0.getReg()))
|
|
UniqueScalarRegs.push_back(Src0.getReg());
|
|
}
|
|
} else if (!TII.isInlineConstant(Src0)) {
|
|
if (IsVOPD3)
|
|
return false;
|
|
addLiteral(Src0);
|
|
}
|
|
|
|
if (InstInfo[CompIdx].hasMandatoryLiteral()) {
|
|
if (IsVOPD3)
|
|
return false;
|
|
|
|
auto CompOprIdx = InstInfo[CompIdx].getMandatoryLiteralCompOperandIndex();
|
|
addLiteral(MI.getOperand(CompOprIdx));
|
|
}
|
|
if (MI.getDesc().hasImplicitUseOfPhysReg(AMDGPU::VCC))
|
|
UniqueScalarRegs.push_back(AMDGPU::VCC_LO);
|
|
|
|
if (IsVOPD3) {
|
|
for (auto OpName : {AMDGPU::OpName::src1, AMDGPU::OpName::src2}) {
|
|
const MachineOperand *Src = TII.getNamedOperand(MI, OpName);
|
|
if (!Src)
|
|
continue;
|
|
if (OpName == AMDGPU::OpName::src2) {
|
|
if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::bitop3))
|
|
continue;
|
|
if (MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
|
|
UniqueScalarRegs.push_back(Src->getReg());
|
|
continue;
|
|
}
|
|
}
|
|
if (!Src->isReg() || !TRI->isVGPR(MRI, Src->getReg()))
|
|
return false;
|
|
}
|
|
|
|
for (auto OpName : {AMDGPU::OpName::clamp, AMDGPU::OpName::omod,
|
|
AMDGPU::OpName::op_sel}) {
|
|
if (TII.hasModifiersSet(MI, OpName))
|
|
return false;
|
|
}
|
|
|
|
// Neg is allowed, other modifiers are not. NB: even though sext has the
|
|
// same value as neg, there are no combinable instructions with sext.
|
|
for (auto OpName :
|
|
{AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
|
|
AMDGPU::OpName::src2_modifiers}) {
|
|
const MachineOperand *Mods = TII.getNamedOperand(MI, OpName);
|
|
if (Mods && (Mods->getImm() & ~SISrcMods::NEG))
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (UniqueLiterals.size() > 1)
|
|
return false;
|
|
if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2)
|
|
return false;
|
|
|
|
// On GFX1170+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2
|
|
// source-cache.
|
|
bool SkipSrc = (ST.hasGFX11_7Insts() || ST.hasGFX12Insts()) &&
|
|
MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
|
|
MIY.getOpcode() == AMDGPU::V_MOV_B32_e32;
|
|
bool AllowSameVGPR = ST.hasGFX1250Insts();
|
|
|
|
if (InstInfo.hasInvalidOperand(getVRegIdx, *TRI, SkipSrc, AllowSameVGPR,
|
|
IsVOPD3))
|
|
return false;
|
|
|
|
if (IsVOPD3) {
|
|
// BITOP3 can be converted to DUAL_BITOP2 only if src2 is zero.
|
|
// MIX check is only relevant to scheduling?
|
|
if (AMDGPU::hasNamedOperand(MIX.getOpcode(), AMDGPU::OpName::bitop3)) {
|
|
const MachineOperand &Src2 =
|
|
*TII.getNamedOperand(MIX, AMDGPU::OpName::src2);
|
|
if (!Src2.isImm() || Src2.getImm())
|
|
return false;
|
|
}
|
|
if (AMDGPU::hasNamedOperand(MIY.getOpcode(), AMDGPU::OpName::bitop3)) {
|
|
const MachineOperand &Src2 =
|
|
*TII.getNamedOperand(MIY, AMDGPU::OpName::src2);
|
|
if (!Src2.isImm() || Src2.getImm())
|
|
return false;
|
|
}
|
|
}
|
|
|
|
LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << MIX
|
|
<< "\n\tY: " << MIY << "\n");
|
|
return true;
|
|
}
|
|
|
|
/// Check if the instr pair, FirstMI and SecondMI, should be scheduled
|
|
/// together. Given SecondMI, when FirstMI is unspecified, then check if
|
|
/// SecondMI may be part of a fused pair at all.
|
|
static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII,
|
|
const TargetSubtargetInfo &TSI,
|
|
const MachineInstr *FirstMI,
|
|
const MachineInstr &SecondMI) {
|
|
const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII);
|
|
const GCNSubtarget &ST = STII.getSubtarget();
|
|
unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(ST);
|
|
unsigned Opc2 = SecondMI.getOpcode();
|
|
|
|
const auto checkVOPD = [&](bool VOPD3) -> bool {
|
|
auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2, EncodingFamily, VOPD3);
|
|
|
|
// One instruction case
|
|
if (!FirstMI)
|
|
return SecondCanBeVOPD.Y || SecondCanBeVOPD.X;
|
|
|
|
unsigned Opc = FirstMI->getOpcode();
|
|
auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc, EncodingFamily, VOPD3);
|
|
|
|
if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) ||
|
|
(FirstCanBeVOPD.Y && SecondCanBeVOPD.X)))
|
|
return false;
|
|
|
|
#ifdef EXPENSIVE_CHECKS
|
|
assert([&]() -> bool {
|
|
for (auto MII = MachineBasicBlock::const_iterator(FirstMI);
|
|
MII != FirstMI->getParent()->instr_end(); ++MII) {
|
|
if (&*MII == &SecondMI)
|
|
return true;
|
|
}
|
|
return false;
|
|
}() && "Expected FirstMI to precede SecondMI");
|
|
#endif
|
|
|
|
return checkVOPDRegConstraints(STII, *FirstMI, SecondMI, VOPD3);
|
|
};
|
|
|
|
return checkVOPD(false) || (ST.hasVOPD3() && checkVOPD(true));
|
|
}
|
|
|
|
namespace {
|
|
/// Adapts design from MacroFusion
|
|
/// Puts valid candidate instructions back-to-back so they can easily
|
|
/// be turned into VOPD instructions
|
|
/// Greedily pairs instruction candidates. O(n^2) algorithm.
|
|
struct VOPDPairingMutation : ScheduleDAGMutation {
|
|
MacroFusionPredTy shouldScheduleAdjacent; // NOLINT: function pointer
|
|
|
|
VOPDPairingMutation(
|
|
MacroFusionPredTy shouldScheduleAdjacent) // NOLINT: function pointer
|
|
: shouldScheduleAdjacent(shouldScheduleAdjacent) {}
|
|
|
|
void apply(ScheduleDAGInstrs *DAG) override {
|
|
const TargetInstrInfo &TII = *DAG->TII;
|
|
const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
|
|
if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) {
|
|
LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n");
|
|
return;
|
|
}
|
|
|
|
std::vector<SUnit>::iterator ISUI, JSUI;
|
|
for (ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) {
|
|
const MachineInstr *IMI = ISUI->getInstr();
|
|
if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI))
|
|
continue;
|
|
if (!hasLessThanNumFused(*ISUI, 2))
|
|
continue;
|
|
|
|
for (JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) {
|
|
if (JSUI->isBoundaryNode())
|
|
continue;
|
|
const MachineInstr *JMI = JSUI->getInstr();
|
|
if (!hasLessThanNumFused(*JSUI, 2) ||
|
|
!shouldScheduleAdjacent(TII, ST, IMI, *JMI))
|
|
continue;
|
|
if (fuseInstructionPair(*DAG, *ISUI, *JSUI))
|
|
break;
|
|
}
|
|
}
|
|
LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n");
|
|
}
|
|
};
|
|
} // namespace
|
|
|
|
std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() {
|
|
return std::make_unique<VOPDPairingMutation>(shouldScheduleVOPDAdjacent);
|
|
}
|