374 lines
11 KiB
C++
374 lines
11 KiB
C++
//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file
|
|
/// Lower VGPRs above first 256 on gfx1250.
|
|
///
|
|
/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch
|
|
/// VGPR addressing mode. The mode change is effective until the next change.
|
|
/// This instruction provides high bits of a VGPR address for four of the
|
|
/// operands: vdst, src0, src1, and src2, or other 4 operands depending on the
|
|
/// instruction encoding. If bits are set they are added as MSB to the
|
|
/// corresponding operand VGPR number.
|
|
///
|
|
/// There is no need to replace actual register operands because encoding of the
|
|
/// high and low VGPRs is the same. I.e. v0 has the encoding 0x100, so does
|
|
/// v256. v1 has the encoding 0x101 and v257 has the same encoding. So high
|
|
/// VGPRs will survive until actual encoding and will result in a same actual
|
|
/// bit encoding.
|
|
///
|
|
/// As a result the pass only inserts S_SET_VGPR_MSB to provide an actual offset
|
|
/// to a VGPR address of the subsequent instructions. The InstPrinter will take
|
|
/// care of printing a low VGPR instead of a high one. In principle this
|
|
/// shall be viable to print actual high VGPR numbers, but that would disagree
|
|
/// with a disasm printing and create a situation where asm text is not
|
|
/// deterministic.
|
|
///
|
|
/// This pass creates a convention where non-fall through basic blocks shall
|
|
/// start with all 4 MSBs zero. Otherwise a disassembly would not be readable.
|
|
/// An optimization here is possible but deemed not desirable because of the
|
|
/// readability concerns.
|
|
///
|
|
/// Consequentially the ABI is set to expect all 4 MSBs to be zero on entry.
|
|
/// The pass must run very late in the pipeline to make sure no changes to VGPR
|
|
/// operands will be made after it.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPULowerVGPREncoding.h"
|
|
#include "AMDGPU.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
|
#include "SIInstrInfo.h"
|
|
#include "llvm/ADT/PackedVector.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"
|
|
|
|
namespace {
|
|
|
|
class AMDGPULowerVGPREncoding {
|
|
static constexpr unsigned OpNum = 4;
|
|
static constexpr unsigned BitsPerField = 2;
|
|
static constexpr unsigned NumFields = 4;
|
|
static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
|
|
using ModeType = PackedVector<unsigned, BitsPerField,
|
|
std::bitset<BitsPerField * NumFields>>;
|
|
|
|
class ModeTy : public ModeType {
|
|
public:
|
|
// bitset constructor will set all bits to zero
|
|
ModeTy() : ModeType(0) {}
|
|
|
|
operator int64_t() const { return raw_bits().to_ulong(); }
|
|
|
|
static ModeTy fullMask() {
|
|
ModeTy M;
|
|
M.raw_bits().flip();
|
|
return M;
|
|
}
|
|
};
|
|
|
|
public:
|
|
bool run(MachineFunction &MF);
|
|
|
|
private:
|
|
const SIInstrInfo *TII;
|
|
const SIRegisterInfo *TRI;
|
|
|
|
/// Most recent s_set_* instruction.
|
|
MachineInstr *MostRecentModeSet;
|
|
|
|
/// Whether the current mode is known.
|
|
bool CurrentModeKnown;
|
|
|
|
/// Current mode bits.
|
|
ModeTy CurrentMode;
|
|
|
|
/// Current mask of mode bits that instructions since MostRecentModeSet care
|
|
/// about.
|
|
ModeTy CurrentMask;
|
|
|
|
/// Number of current hard clause instructions.
|
|
unsigned ClauseLen;
|
|
|
|
/// Number of hard clause instructions remaining.
|
|
unsigned ClauseRemaining;
|
|
|
|
/// Clause group breaks.
|
|
unsigned ClauseBreaks;
|
|
|
|
/// Last hard clause instruction.
|
|
MachineInstr *Clause;
|
|
|
|
/// Insert mode change before \p I. \returns true if mode was changed.
|
|
bool setMode(ModeTy NewMode, ModeTy Mask, MachineInstr *I);
|
|
|
|
/// Reset mode to default.
|
|
void resetMode(MachineInstr *I) { setMode(ModeTy(), ModeTy::fullMask(), I); }
|
|
|
|
/// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
|
|
std::optional<unsigned> getMSBs(const MachineOperand &MO) const;
|
|
|
|
/// Handle single \p MI. \return true if changed.
|
|
bool runOnMachineInstr(MachineInstr &MI);
|
|
|
|
/// Compute the mode and mode mask for a single \p MI given \p Ops operands
|
|
/// bit mapping. Optionally takes second array \p Ops2 for VOPD.
|
|
/// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
|
|
/// is checked.
|
|
void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI,
|
|
const AMDGPU::OpName Ops[OpNum],
|
|
const AMDGPU::OpName *Ops2 = nullptr);
|
|
|
|
/// Check if an instruction \p I is within a clause and returns a suitable
|
|
/// iterator to insert mode change. It may also modify the S_CLAUSE
|
|
/// instruction to extend it or drop the clause if it cannot be adjusted.
|
|
MachineInstr *handleClause(MachineInstr *I);
|
|
};
|
|
|
|
/// Make sure the mode fields selected by \p Mask hold the values in
/// \p NewMode when \p I executes.
///
/// Three outcomes are possible:
///  - the known current mode already matches on the requested fields: only
///    the set of cared-about fields is widened and nothing is emitted;
///  - the fields that differ are not used by any instruction since the most
///    recent mode set in this block: that instruction's immediate is updated
///    in place;
///  - otherwise a new S_SET_VGPR_MSB is built before the insertion point
///    chosen by handleClause().
///
/// \p NewMode must not have bits set outside of \p Mask.
/// \returns true if an instruction was inserted or modified.
bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
                                      MachineInstr *I) {
  assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());

  if (CurrentModeKnown) {
    const auto Differing = NewMode.raw_bits() ^ CurrentMode.raw_bits();

    // Nothing this instruction cares about differs from the current mode.
    const bool AlreadySatisfied = (Differing & Mask.raw_bits()).none();
    if (AlreadySatisfied) {
      CurrentMask |= Mask;
      return false;
    }

    // The differing fields are unused so far, so the previous mode set can
    // simply be widened to provide them as well.
    const bool CanPatchPrevious =
        MostRecentModeSet && (Differing & CurrentMask.raw_bits()).none();
    if (CanPatchPrevious) {
      CurrentMode |= NewMode;
      CurrentMask |= Mask;

      MostRecentModeSet->getOperand(0).setImm(CurrentMode);
      return true;
    }
  }

  // A fresh mode set is required; handleClause() may move the insertion
  // point or adjust an enclosing clause.
  I = handleClause(I);
  MostRecentModeSet =
      BuildMI(*I->getParent(), I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
          .addImm(NewMode);

  CurrentModeKnown = true;
  CurrentMask = Mask;
  CurrentMode = NewMode;
  return true;
}
|
|
|
|
std::optional<unsigned>
|
|
AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
|
|
if (!MO.isReg())
|
|
return std::nullopt;
|
|
|
|
MCRegister Reg = MO.getReg();
|
|
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
|
|
if (!RC || !TRI->isVGPRClass(RC))
|
|
return std::nullopt;
|
|
|
|
unsigned Idx = TRI->getHWRegIndex(Reg);
|
|
return Idx >> 8;
|
|
}
|
|
|
|
/// Compute the mode \p MI requires and the mask of fields it cares about.
///
/// Both \p NewMode and \p Mask are cleared first. Then, for each of the OpNum
/// slots, the operand named by \p Ops (or, when that one yields no VGPR MSBs
/// and \p Ops2 is given, the operand named by \p Ops2) is inspected; if it is
/// a VGPR its MSBs are stored in the slot's field of \p NewMode and the
/// field is fully set in \p Mask.
void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
                                          MachineInstr &MI,
                                          const AMDGPU::OpName Ops[OpNum],
                                          const AMDGPU::OpName *Ops2) {
  NewMode = {};
  Mask = {};

  for (unsigned Slot = 0; Slot != OpNum; ++Slot) {
    MachineOperand *Op = TII->getNamedOperand(MI, Ops[Slot]);
    std::optional<unsigned> MSBits = Op ? getMSBs(*Op) : std::nullopt;

#if !defined(NDEBUG)
    // Debug-only sanity check: when both tables name a VGPR for this slot
    // the two operands must agree on their MSBs.
    if (MSBits && Ops2) {
      if (auto *PairedOp = TII->getNamedOperand(MI, Ops2[Slot])) {
        std::optional<unsigned> PairedBits = getMSBs(*PairedOp);
        if (PairedBits && MSBits != PairedBits)
          llvm_unreachable("Invalid VOPD pair was created");
      }
    }
#endif

    // Fall back to the second table if the first gave no VGPR for this slot.
    if (!MSBits && Ops2) {
      Op = TII->getNamedOperand(MI, Ops2[Slot]);
      if (Op)
        MSBits = getMSBs(*Op);
    }

    if (!MSBits)
      continue;

    // Tied src2 uses are skipped for VOP2, and for VOP3 opcodes that have a
    // 32-bit VALU encoding: these are handled along with the defs, only the
    // vdst bit affects them. Other tied VOP3 uses are real operands and must
    // be accounted for even though they have to match vdst.
    const bool IsSkippedTiedSrc2 =
        Ops[Slot] == AMDGPU::OpName::src2 && !Op->isDef() && Op->isTied() &&
        (SIInstrInfo::isVOP2(MI) ||
         (SIInstrInfo::isVOP3(MI) &&
          TII->hasVALU32BitEncoding(MI.getOpcode())));
    if (!IsSkippedTiedSrc2) {
      NewMode[Slot] = *MSBits;
      Mask[Slot] = FieldMask;
    }
  }
}
|
|
|
|
/// Process one instruction: look up its VGPR lowering operand tables and, if
/// it has any, compute the mode it needs and request it via setMode().
/// \returns true if a mode-setting instruction was inserted or modified.
bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
  auto Tables = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());

  if (!Tables.first) {
    // Without operand tables the instruction is expected not to use VGPRs,
    // unless it is a meta or pseudo instruction.
    assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
    return false;
  }

  ModeTy RequiredMode, CareMask;
  computeMode(RequiredMode, CareMask, MI, Tables.first, Tables.second);
  return setMode(RequiredMode, CareMask, &MI);
}
|
|
|
|
/// Pick the instruction before which a mode change for \p I should be
/// inserted, taking the hard clause currently being scanned into account.
///
/// Outside of a clause \p I itself is returned. At the first instruction of
/// a clause the node preceding the S_CLAUSE is returned. Inside a clause
/// with group breaks the S_CLAUSE is erased; otherwise the recorded clause
/// length is bumped to make room for the extra instruction when it fits.
MachineInstr *AMDGPULowerVGPREncoding::handleClause(MachineInstr *I) {
  const bool InsideClause = ClauseRemaining != 0;
  if (!InsideClause)
    return I;

  // A clause cannot start with a special instruction, so the mode change
  // goes right before the clause instead.
  const bool AtClauseStart = ClauseRemaining == ClauseLen;
  if (AtClauseStart) {
    MachineInstr *BeforeClause = Clause->getPrevNode();
    assert(BeforeClause->isBundle());
    return BeforeClause;
  }

  // With group breaks defined no group may start with a mode change; the
  // clause is simply dropped.
  if (ClauseBreaks != 0) {
    Clause->eraseFromBundle();
    ClauseRemaining = 0;
    return I;
  }

  // Otherwise grow the clause by one instruction if the encoding allows it;
  // if not, the clause just ends up shorter. The immediate stores the length
  // minus one, hence the pre-increment value is written. Note that
  // SIMM16[5:0] must be 1-62, not 0 or 63.
  const unsigned PrevLen = ClauseLen++;
  if (PrevLen < 63)
    Clause->getOperand(0).setImm(PrevLen | (ClauseBreaks << 8));

  return I;
}
|
|
|
|
/// Scan every instruction of \p MF in layout order and insert the mode
/// changes required by VGPR operands.
///
/// Does nothing on subtargets without 1024 addressable VGPRs.
/// \returns true if any instruction reported a change through
/// runOnMachineInstr().
bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.has1024AddressableVGPRs())
    return false;

  TII = ST.getInstrInfo();
  TRI = ST.getRegisterInfo();

  // Start from a known, all-zero mode and outside of any clause.
  ClauseRemaining = 0;
  ClauseLen = 0;
  CurrentMode.reset();
  CurrentMask.reset();
  CurrentModeKnown = true;

  bool MadeChange = false;
  for (MachineBasicBlock &MBB : MF) {
    // A previous mode set is only ever patched within its own block.
    MostRecentModeSet = nullptr;

    for (MachineInstr &MI : llvm::make_early_inc_range(MBB.instrs())) {
      if (MI.isMetaInstruction())
        continue;

      if (MI.isTerminator() || MI.isCall()) {
        const unsigned Opc = MI.getOpcode();
        const bool EndsProgram =
            Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED;
        if (EndsProgram) {
          // No reset is emitted at program end; just track the mode as zero.
          CurrentMode.reset();
          CurrentModeKnown = true;
        } else {
          resetMode(&MI);
        }
        continue;
      }

      if (MI.isInlineAsm()) {
        // Inline asm that touches VGPRs runs with the default mode.
        if (TII->hasVGPRUses(MI))
          resetMode(&MI);
        continue;
      }

      if (MI.getOpcode() == AMDGPU::S_CLAUSE) {
        assert(!ClauseRemaining && "Nested clauses are not supported");
        // Bits [11:8] of the immediate hold the group breaks, bits [5:0]
        // the clause length minus one.
        const unsigned ClauseImm = MI.getOperand(0).getImm();
        ClauseBreaks = (ClauseImm >> 8) & 15;
        ClauseLen = ClauseRemaining = (ClauseImm & 63) + 1;
        Clause = &MI;
        continue;
      }

      MadeChange |= runOnMachineInstr(MI);

      if (ClauseRemaining)
        --ClauseRemaining;
    }

    // If we're falling through to a block that has at least one other
    // predecessor, we no longer know the mode. Only a non-zero mode is
    // forgotten; presumably the other predecessors arrive with all MSBs
    // zero per the convention described in the file header.
    MachineBasicBlock *Next = MBB.getNextNode();
    const bool FallsIntoJoin = Next && Next->pred_size() >= 2 &&
                               llvm::is_contained(Next->predecessors(), &MBB);
    if (FallsIntoJoin && CurrentMode.raw_bits().any())
      CurrentModeKnown = false;
  }

  return MadeChange;
}
|
|
|
|
class AMDGPULowerVGPREncodingLegacy : public MachineFunctionPass {
|
|
public:
|
|
static char ID;
|
|
|
|
AMDGPULowerVGPREncodingLegacy() : MachineFunctionPass(ID) {}
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override {
|
|
return AMDGPULowerVGPREncoding().run(MF);
|
|
}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.setPreservesCFG();
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
char AMDGPULowerVGPREncodingLegacy::ID = 0;
|
|
|
|
char &llvm::AMDGPULowerVGPREncodingLegacyID = AMDGPULowerVGPREncodingLegacy::ID;
|
|
|
|
INITIALIZE_PASS(AMDGPULowerVGPREncodingLegacy, DEBUG_TYPE,
|
|
"AMDGPU Lower VGPR Encoding", false, false)
|
|
|
|
/// New pass manager entry point. Runs the lowering on \p MF and reports all
/// analyses as preserved when nothing changed, or only the CFG analyses
/// otherwise.
PreservedAnalyses
AMDGPULowerVGPREncodingPass::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  const bool Modified = AMDGPULowerVGPREncoding().run(MF);
  if (Modified) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }

  return PreservedAnalyses::all();
}
|