
Fixes: ``` *** Bad machine code: Using an undefined physical register *** - function: widget - basic block: %bb.0 bb (0x564092cbe140) - instruction: $vgpr63 = V_ACCVGPR_READ_B32_e64 killed $agpr13, implicit $exec - operand 1: killed $agpr13 LLVM ERROR: Found 1 machine code errors. ``` The detailed sequence of events that led to this assert: 1. MachineVerifier fails because `$agpr13` is not defined on line 19 below: ``` 1: bb.0.bb: 2: successors: %bb.1(0x80000000); %bb.1(100.00%) 3: liveins: $agpr14, $agpr15, $sgpr12, $sgpr13, $sgpr14, \ 4: $sgpr15, $sgpr30, $sgpr31, $sgpr34, $sgpr35, \ 5: $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr48, \ 6: $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, \ 7: $sgpr54, $sgpr55, $sgpr64, $sgpr65, $sgpr66, \ 8: $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, \ 9: $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, \ 10: $sgpr85, $sgpr86, $sgpr87, $sgpr96, $sgpr97, \ 11: $sgpr98, $sgpr99, $vgpr0, $vgpr31, $vgpr40, $vgpr41, \ 12: $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, \ 13: $sgpr10_sgpr11 14: $sgpr16 = COPY $sgpr33 15: $sgpr33 = frame-setup COPY $sgpr32 16: $sgpr18_sgpr19 = S_XOR_SAVEEXEC_B64 -1, \ 17: implicit-def $exec, implicit-def dead $scc, \ 18: implicit $exec 19: $vgpr63 = V_ACCVGPR_READ_B32_e64 killed $agpr13, \ 20: implicit $exec 21: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, \ 22: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, \ 23: implicit $exec :: (store (s32) into %stack.38, \ 24: addrspace 5) 25: ... 26: $vgpr43 = IMPLICIT_DEF 27: $vgpr43 = SI_SPILL_S32_TO_VGPR $sgpr15, 0, \ 28: killed $vgpr43(tied-def 0) 29: $vgpr43 = SI_SPILL_S32_TO_VGPR $sgpr14, 1, \ 30: killed $vgpr43(tied-def 0) 31: $sgpr100_sgpr101 = S_OR_SAVEEXEC_B64 -1, \ 32: implicit-def $exec, implicit-def dead $scc, \ 33: implicit $exec 34: renamable $agpr13 = COPY killed $vgpr43, implicit $exec ``` 2. That instruction is created by [`emitCSRSpillStores`](d599bdeaa4/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (L977)
) (called by [`SIFrameLowering::emitPrologue`](d599bdeaa4/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp (L1122)
)) because `$agpr13` is in `WWMSpills`. See lines 982, 998, and 993 below. ``` 977: // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch 978: // registers. However, save all lanes of callee-saved VGPRs. Due to this, we 979: // might end up flipping the EXEC bits twice. 980: Register ScratchExecCopy; 981: SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs; 982: FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs); 983: if (!WWMScratchRegs.empty()) 984: ScratchExecCopy = 985: buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, 986: /*IsProlog*/ true, /*EnableInactiveLanes*/ true); 987: 988: auto StoreWWMRegisters = 989: [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) { 990: for (const auto &Reg : WWMRegs) { 991: Register VGPR = Reg.first; 992: int FI = Reg.second; 993: buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL, 994: VGPR, FI, FrameReg); 995: } 996: }; 997: 998: StoreWWMRegisters(WWMScratchRegs); ``` 3. `$agpr13` got added to `WWMSpills` by [`SILowerWWMCopies::run`](59a7185dd9/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp (L137)
) as it processed the `WWM_COPY` on line 3 below (corresponds to line 34 above in point #_1_): ``` 1: %45:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr15, 0, %45:vgpr_32(tied-def 0) 2: %45:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr14, 1, %45:vgpr_32(tied-def 0) 3: %44:av_32 = WWM_COPY %45:vgpr_32 ```
2196 lines
87 KiB
C++
2196 lines
87 KiB
C++
//===----------------------- SIFrameLowering.cpp --------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//==-----------------------------------------------------------------------===//
|
|
|
|
#include "SIFrameLowering.h"
|
|
#include "AMDGPU.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
|
#include "SIMachineFunctionInfo.h"
|
|
#include "llvm/CodeGen/LiveRegUnits.h"
|
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
#include "llvm/CodeGen/RegisterScavenging.h"
|
|
#include "llvm/Target/TargetMachine.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "frame-info"
|
|
|
|
static cl::opt<bool> EnableSpillVGPRToAGPR(
|
|
"amdgpu-spill-vgpr-to-agpr",
|
|
cl::desc("Enable spilling VGPRs to AGPRs"),
|
|
cl::ReallyHidden,
|
|
cl::init(true));
|
|
|
|
// Find a register matching \p RC from \p LiveUnits which is unused and
|
|
// available throughout the function. On failure, returns AMDGPU::NoRegister.
|
|
// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
|
|
// MCRegisters. This should reduce the number of iterations and avoid redundant
|
|
// checking.
|
|
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
|
|
const LiveRegUnits &LiveUnits,
|
|
const TargetRegisterClass &RC) {
|
|
for (MCRegister Reg : RC) {
|
|
if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
|
|
!MRI.isReserved(Reg))
|
|
return Reg;
|
|
}
|
|
return MCRegister();
|
|
}
|
|
|
|
// Find a scratch register that we can use in the prologue. We avoid using
|
|
// callee-save registers since they may appear to be free when this is called
|
|
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
|
|
// when this is called from emitPrologue.
|
|
static MCRegister findScratchNonCalleeSaveRegister(
|
|
MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
|
|
const TargetRegisterClass &RC, bool Unused = false) {
|
|
// Mark callee saved registers as used so we will not choose them.
|
|
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
|
|
for (unsigned i = 0; CSRegs[i]; ++i)
|
|
LiveUnits.addReg(CSRegs[i]);
|
|
|
|
// We are looking for a register that can be used throughout the entire
|
|
// function, so any use is unacceptable.
|
|
if (Unused)
|
|
return findUnusedRegister(MRI, LiveUnits, RC);
|
|
|
|
for (MCRegister Reg : RC) {
|
|
if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
|
|
return Reg;
|
|
}
|
|
|
|
return MCRegister();
|
|
}
|
|
|
|
/// Query target location for spilling SGPRs
|
|
/// \p IncludeScratchCopy : Also look for free scratch SGPRs
|
|
static void getVGPRSpillLaneOrTempRegister(
|
|
MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
|
|
const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
|
|
bool IncludeScratchCopy = true) {
|
|
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
|
|
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
|
unsigned Size = TRI->getSpillSize(RC);
|
|
Align Alignment = TRI->getSpillAlign(RC);
|
|
|
|
// We need to save and restore the given SGPR.
|
|
|
|
Register ScratchSGPR;
|
|
// 1: Try to save the given register into an unused scratch SGPR. The
|
|
// LiveUnits should have all the callee saved registers marked as used. For
|
|
// certain cases we skip copy to scratch SGPR.
|
|
if (IncludeScratchCopy)
|
|
ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);
|
|
|
|
if (!ScratchSGPR) {
|
|
int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
|
|
TargetStackID::SGPRSpill);
|
|
|
|
if (TRI->spillSGPRToVGPR() &&
|
|
MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
|
|
/*IsPrologEpilog=*/true)) {
|
|
// 2: There's no free lane to spill, and no free register to save the
|
|
// SGPR, so we're forced to take another VGPR to use for the spill.
|
|
MFI->addToPrologEpilogSGPRSpills(
|
|
SGPR, PrologEpilogSGPRSaveRestoreInfo(
|
|
SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
|
|
|
|
LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
|
|
dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
|
|
<< printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
|
|
<< '\n';);
|
|
} else {
|
|
// Remove dead <FI> index
|
|
MF.getFrameInfo().RemoveStackObject(FI);
|
|
// 3: If all else fails, spill the register to memory.
|
|
FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
|
|
MFI->addToPrologEpilogSGPRSpills(
|
|
SGPR,
|
|
PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
|
|
LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
|
|
<< printReg(SGPR, TRI) << '\n');
|
|
}
|
|
} else {
|
|
MFI->addToPrologEpilogSGPRSpills(
|
|
SGPR, PrologEpilogSGPRSaveRestoreInfo(
|
|
SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
|
|
LiveUnits.addReg(ScratchSGPR);
|
|
LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
|
|
<< printReg(ScratchSGPR, TRI) << '\n');
|
|
}
|
|
}
|
|
|
|
// We need to specially emit stack operations here because a different frame
|
|
// register is used than in the rest of the function, as getFrameRegister would
|
|
// use.
|
|
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
|
|
const SIMachineFunctionInfo &FuncInfo,
|
|
LiveRegUnits &LiveUnits, MachineFunction &MF,
|
|
MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator I, const DebugLoc &DL,
|
|
Register SpillReg, int FI, Register FrameReg,
|
|
int64_t DwordOff = 0) {
|
|
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
|
|
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
|
|
|
|
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
|
|
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
|
|
MachineMemOperand *MMO = MF.getMachineMemOperand(
|
|
PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
|
|
FrameInfo.getObjectAlign(FI));
|
|
LiveUnits.addReg(SpillReg);
|
|
bool IsKill = !MBB.isLiveIn(SpillReg);
|
|
TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
|
|
DwordOff, MMO, nullptr, &LiveUnits);
|
|
if (IsKill)
|
|
LiveUnits.removeReg(SpillReg);
|
|
}
|
|
|
|
static void buildEpilogRestore(const GCNSubtarget &ST,
|
|
const SIRegisterInfo &TRI,
|
|
const SIMachineFunctionInfo &FuncInfo,
|
|
LiveRegUnits &LiveUnits, MachineFunction &MF,
|
|
MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator I,
|
|
const DebugLoc &DL, Register SpillReg, int FI,
|
|
Register FrameReg, int64_t DwordOff = 0) {
|
|
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
|
|
: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
|
|
|
|
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
|
|
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
|
|
MachineMemOperand *MMO = MF.getMachineMemOperand(
|
|
PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
|
|
FrameInfo.getObjectAlign(FI));
|
|
TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
|
|
DwordOff, MMO, nullptr, &LiveUnits);
|
|
}
|
|
|
|
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
|
|
const DebugLoc &DL, const SIInstrInfo *TII,
|
|
Register TargetReg) {
|
|
MachineFunction *MF = MBB.getParent();
|
|
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
|
|
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
|
|
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
|
|
Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
|
|
Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);
|
|
|
|
if (MFI->getGITPtrHigh() != 0xffffffff) {
|
|
BuildMI(MBB, I, DL, SMovB32, TargetHi)
|
|
.addImm(MFI->getGITPtrHigh())
|
|
.addReg(TargetReg, RegState::ImplicitDefine);
|
|
} else {
|
|
const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
|
|
BuildMI(MBB, I, DL, GetPC64, TargetReg);
|
|
}
|
|
Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
|
|
MF->getRegInfo().addLiveIn(GitPtrLo);
|
|
MBB.addLiveIn(GitPtrLo);
|
|
BuildMI(MBB, I, DL, SMovB32, TargetLo)
|
|
.addReg(GitPtrLo);
|
|
}
|
|
|
|
static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
|
|
const SIMachineFunctionInfo *FuncInfo,
|
|
MachineFunction &MF, MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI, bool IsProlog) {
|
|
if (LiveUnits.empty()) {
|
|
LiveUnits.init(TRI);
|
|
if (IsProlog) {
|
|
LiveUnits.addLiveIns(MBB);
|
|
} else {
|
|
// In epilog.
|
|
LiveUnits.addLiveOuts(MBB);
|
|
LiveUnits.stepBackward(*MBBI);
|
|
}
|
|
}
|
|
}
|
|
|
|
namespace llvm {
|
|
|
|
// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
|
|
// BP, etc. These spills are delayed until the current function's frame is
|
|
// finalized. For a given register, the builder uses the
|
|
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
|
|
class PrologEpilogSGPRSpillBuilder {
|
|
MachineBasicBlock::iterator MI;
|
|
MachineBasicBlock &MBB;
|
|
MachineFunction &MF;
|
|
const GCNSubtarget &ST;
|
|
MachineFrameInfo &MFI;
|
|
SIMachineFunctionInfo *FuncInfo;
|
|
const SIInstrInfo *TII;
|
|
const SIRegisterInfo &TRI;
|
|
Register SuperReg;
|
|
const PrologEpilogSGPRSaveRestoreInfo SI;
|
|
LiveRegUnits &LiveUnits;
|
|
const DebugLoc &DL;
|
|
Register FrameReg;
|
|
ArrayRef<int16_t> SplitParts;
|
|
unsigned NumSubRegs;
|
|
unsigned EltSize = 4;
|
|
|
|
void saveToMemory(const int FI) const {
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
assert(!MFI.isDeadObjectIndex(FI));
|
|
|
|
initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
|
|
|
|
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
|
|
MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
|
|
if (!TmpVGPR)
|
|
report_fatal_error("failed to find free scratch register");
|
|
|
|
for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
|
|
Register SubReg = NumSubRegs == 1
|
|
? SuperReg
|
|
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
|
|
.addReg(SubReg);
|
|
|
|
buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
|
|
FI, FrameReg, DwordOff);
|
|
DwordOff += 4;
|
|
}
|
|
}
|
|
|
|
void saveToVGPRLane(const int FI) const {
|
|
assert(!MFI.isDeadObjectIndex(FI));
|
|
|
|
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
|
|
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
|
|
FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
|
|
assert(Spill.size() == NumSubRegs);
|
|
|
|
for (unsigned I = 0; I < NumSubRegs; ++I) {
|
|
Register SubReg = NumSubRegs == 1
|
|
? SuperReg
|
|
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
|
|
Spill[I].VGPR)
|
|
.addReg(SubReg)
|
|
.addImm(Spill[I].Lane)
|
|
.addReg(Spill[I].VGPR, RegState::Undef);
|
|
}
|
|
}
|
|
|
|
void copyToScratchSGPR(Register DstReg) const {
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
|
|
.addReg(SuperReg)
|
|
.setMIFlag(MachineInstr::FrameSetup);
|
|
}
|
|
|
|
void restoreFromMemory(const int FI) {
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
|
|
initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
|
|
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
|
|
MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
|
|
if (!TmpVGPR)
|
|
report_fatal_error("failed to find free scratch register");
|
|
|
|
for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
|
|
Register SubReg = NumSubRegs == 1
|
|
? SuperReg
|
|
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
|
|
|
|
buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
|
|
TmpVGPR, FI, FrameReg, DwordOff);
|
|
MRI.constrainRegClass(SubReg, &AMDGPU::SReg_32_XM0RegClass);
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
|
|
.addReg(TmpVGPR, RegState::Kill);
|
|
DwordOff += 4;
|
|
}
|
|
}
|
|
|
|
void restoreFromVGPRLane(const int FI) {
|
|
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
|
|
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
|
|
FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
|
|
assert(Spill.size() == NumSubRegs);
|
|
|
|
for (unsigned I = 0; I < NumSubRegs; ++I) {
|
|
Register SubReg = NumSubRegs == 1
|
|
? SuperReg
|
|
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
|
|
.addReg(Spill[I].VGPR)
|
|
.addImm(Spill[I].Lane);
|
|
}
|
|
}
|
|
|
|
void copyFromScratchSGPR(Register SrcReg) const {
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
|
|
.addReg(SrcReg)
|
|
.setMIFlag(MachineInstr::FrameDestroy);
|
|
}
|
|
|
|
public:
|
|
PrologEpilogSGPRSpillBuilder(Register Reg,
|
|
const PrologEpilogSGPRSaveRestoreInfo SI,
|
|
MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MI,
|
|
const DebugLoc &DL, const SIInstrInfo *TII,
|
|
const SIRegisterInfo &TRI,
|
|
LiveRegUnits &LiveUnits, Register FrameReg)
|
|
: MI(MI), MBB(MBB), MF(*MBB.getParent()),
|
|
ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
|
|
FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
|
|
SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
|
|
FrameReg(FrameReg) {
|
|
const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
|
|
SplitParts = TRI.getRegSplitParts(RC, EltSize);
|
|
NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
|
|
|
|
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
|
|
}
|
|
|
|
void save() {
|
|
switch (SI.getKind()) {
|
|
case SGPRSaveKind::SPILL_TO_MEM:
|
|
return saveToMemory(SI.getIndex());
|
|
case SGPRSaveKind::SPILL_TO_VGPR_LANE:
|
|
return saveToVGPRLane(SI.getIndex());
|
|
case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
|
|
return copyToScratchSGPR(SI.getReg());
|
|
}
|
|
}
|
|
|
|
void restore() {
|
|
switch (SI.getKind()) {
|
|
case SGPRSaveKind::SPILL_TO_MEM:
|
|
return restoreFromMemory(SI.getIndex());
|
|
case SGPRSaveKind::SPILL_TO_VGPR_LANE:
|
|
return restoreFromVGPRLane(SI.getIndex());
|
|
case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
|
|
return copyFromScratchSGPR(SI.getReg());
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace llvm
|
|
|
|
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
|
|
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
|
|
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
|
|
const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
const SIInstrInfo *TII = ST.getInstrInfo();
|
|
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
|
|
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
|
|
// We don't need this if we only have spills since there is no user facing
|
|
// scratch.
|
|
|
|
// TODO: If we know we don't have flat instructions earlier, we can omit
|
|
// this from the input registers.
|
|
//
|
|
// TODO: We only need to know if we access scratch space through a flat
|
|
// pointer. Because we only detect if flat instructions are used at all,
|
|
// this will be used more often than necessary on VI.
|
|
|
|
Register FlatScrInitLo;
|
|
Register FlatScrInitHi;
|
|
|
|
if (ST.isAmdPalOS()) {
|
|
// Extract the scratch offset from the descriptor in the GIT
|
|
LiveRegUnits LiveUnits;
|
|
LiveUnits.init(*TRI);
|
|
LiveUnits.addLiveIns(MBB);
|
|
|
|
// Find unused reg to load flat scratch init into
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
Register FlatScrInit = AMDGPU::NoRegister;
|
|
ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
|
|
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
|
|
AllSGPR64s = AllSGPR64s.slice(
|
|
std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
|
|
Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
|
|
for (MCPhysReg Reg : AllSGPR64s) {
|
|
if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
|
|
MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
|
|
FlatScrInit = Reg;
|
|
break;
|
|
}
|
|
}
|
|
assert(FlatScrInit && "Failed to find free register for scratch init");
|
|
|
|
FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
|
|
FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
|
|
|
|
buildGitPtr(MBB, I, DL, TII, FlatScrInit);
|
|
|
|
// We now have the GIT ptr - now get the scratch descriptor from the entry
|
|
// at offset 0 (or offset 16 for a compute shader).
|
|
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
|
|
const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
|
|
auto *MMO = MF.getMachineMemOperand(
|
|
PtrInfo,
|
|
MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
|
|
MachineMemOperand::MODereferenceable,
|
|
8, Align(4));
|
|
unsigned Offset =
|
|
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
|
|
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
|
|
unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
|
|
BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
|
|
.addReg(FlatScrInit)
|
|
.addImm(EncodedOffset) // offset
|
|
.addImm(0) // cpol
|
|
.addMemOperand(MMO);
|
|
|
|
// Mask the offset in [47:0] of the descriptor
|
|
const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
|
|
auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
|
|
.addReg(FlatScrInitHi)
|
|
.addImm(0xffff);
|
|
And->getOperand(3).setIsDead(); // Mark SCC as dead.
|
|
} else {
|
|
Register FlatScratchInitReg =
|
|
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
|
|
assert(FlatScratchInitReg);
|
|
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
MRI.addLiveIn(FlatScratchInitReg);
|
|
MBB.addLiveIn(FlatScratchInitReg);
|
|
|
|
FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
|
|
FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
|
|
}
|
|
|
|
// Do a 64-bit pointer add.
|
|
if (ST.flatScratchIsPointer()) {
|
|
if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
|
|
.addReg(FlatScrInitLo)
|
|
.addReg(ScratchWaveOffsetReg);
|
|
auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
|
|
FlatScrInitHi)
|
|
.addReg(FlatScrInitHi)
|
|
.addImm(0);
|
|
Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
|
|
|
|
using namespace AMDGPU::Hwreg;
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
|
|
.addReg(FlatScrInitLo)
|
|
.addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
|
|
.addReg(FlatScrInitHi)
|
|
.addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
|
|
return;
|
|
}
|
|
|
|
// For GFX9.
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
|
|
.addReg(FlatScrInitLo)
|
|
.addReg(ScratchWaveOffsetReg);
|
|
auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
|
|
AMDGPU::FLAT_SCR_HI)
|
|
.addReg(FlatScrInitHi)
|
|
.addImm(0);
|
|
Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
|
|
|
|
return;
|
|
}
|
|
|
|
assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
|
|
|
|
// Copy the size in bytes.
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
|
|
.addReg(FlatScrInitHi, RegState::Kill);
|
|
|
|
// Add wave offset in bytes to private base offset.
|
|
// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
|
|
.addReg(FlatScrInitLo)
|
|
.addReg(ScratchWaveOffsetReg);
|
|
|
|
// Convert offset to 256-byte units.
|
|
auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
|
|
AMDGPU::FLAT_SCR_HI)
|
|
.addReg(FlatScrInitLo, RegState::Kill)
|
|
.addImm(8);
|
|
LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
|
|
}
|
|
|
|
// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
|
|
// memory. They should have been removed by now.
|
|
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
|
|
for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
|
|
I != E; ++I) {
|
|
if (!MFI.isDeadObjectIndex(I))
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// Shift down registers reserved for the scratch RSRC.
|
|
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
|
|
MachineFunction &MF) const {
|
|
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
const SIInstrInfo *TII = ST.getInstrInfo();
|
|
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
|
|
assert(MFI->isEntryFunction());
|
|
|
|
Register ScratchRsrcReg = MFI->getScratchRSrcReg();
|
|
|
|
if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
|
|
allStackObjectsAreDead(MF.getFrameInfo())))
|
|
return Register();
|
|
|
|
if (ST.hasSGPRInitBug() ||
|
|
ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
|
|
return ScratchRsrcReg;
|
|
|
|
// We reserved the last registers for this. Shift it down to the end of those
|
|
// which were actually used.
|
|
//
|
|
// FIXME: It might be safer to use a pseudoregister before replacement.
|
|
|
|
// FIXME: We should be able to eliminate unused input registers. We only
|
|
// cannot do this for the resources required for scratch access. For now we
|
|
// skip over user SGPRs and may leave unused holes.
|
|
|
|
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
|
|
ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
|
|
AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
|
|
|
|
// Skip the last N reserved elements because they should have already been
|
|
// reserved for VCC etc.
|
|
Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
|
|
for (MCPhysReg Reg : AllSGPR128s) {
|
|
// Pick the first unallocated one. Make sure we don't clobber the other
|
|
// reserved input we needed. Also for PAL, make sure we don't clobber
|
|
// the GIT pointer passed in SGPR0 or SGPR8.
|
|
if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
|
|
(!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
|
|
MRI.replaceRegWith(ScratchRsrcReg, Reg);
|
|
MFI->setScratchRSrcReg(Reg);
|
|
MRI.reserveReg(Reg, TRI);
|
|
return Reg;
|
|
}
|
|
}
|
|
|
|
return ScratchRsrcReg;
|
|
}
|
|
|
|
static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
|
|
return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
|
|
}
|
|
|
|
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
|
|
MachineBasicBlock &MBB) const {
|
|
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
|
|
|
|
// FIXME: If we only have SGPR spills, we won't actually be using scratch
|
|
// memory since these spill to VGPRs. We should be cleaning up these unused
|
|
// SGPR spill frame indices somewhere.
|
|
|
|
// FIXME: We still have implicit uses on SGPR spill instructions in case they
|
|
// need to spill to vector memory. It's likely that will not happen, but at
|
|
// this point it appears we need the setup. This part of the prolog should be
|
|
// emitted after frame indices are eliminated.
|
|
|
|
// FIXME: Remove all of the isPhysRegUsed checks
|
|
|
|
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
const SIInstrInfo *TII = ST.getInstrInfo();
|
|
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
const Function &F = MF.getFunction();
|
|
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
|
|
|
|
assert(MFI->isEntryFunction());
|
|
|
|
Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
|
|
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
|
|
|
|
// We need to do the replacement of the private segment buffer register even
|
|
// if there are no stack objects. There could be stores to undef or a
|
|
// constant without an associated object.
|
|
//
|
|
// This will return `Register()` in cases where there are no actual
|
|
// uses of the SRSRC.
|
|
Register ScratchRsrcReg;
|
|
if (!ST.enableFlatScratch())
|
|
ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
|
|
|
|
// Make the selected register live throughout the function.
|
|
if (ScratchRsrcReg) {
|
|
for (MachineBasicBlock &OtherBB : MF) {
|
|
if (&OtherBB != &MBB) {
|
|
OtherBB.addLiveIn(ScratchRsrcReg);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Now that we have fixed the reserved SRSRC we need to locate the
|
|
// (potentially) preloaded SRSRC.
|
|
Register PreloadedScratchRsrcReg;
|
|
if (ST.isAmdHsaOrMesa(F)) {
|
|
PreloadedScratchRsrcReg =
|
|
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
|
|
if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
|
|
// We added live-ins during argument lowering, but since they were not
|
|
// used they were deleted. We're adding the uses now, so add them back.
|
|
MRI.addLiveIn(PreloadedScratchRsrcReg);
|
|
MBB.addLiveIn(PreloadedScratchRsrcReg);
|
|
}
|
|
}
|
|
|
|
// Debug location must be unknown since the first debug location is used to
|
|
// determine the end of the prologue.
|
|
DebugLoc DL;
|
|
MachineBasicBlock::iterator I = MBB.begin();
|
|
|
|
// We found the SRSRC first because it needs four registers and has an
|
|
// alignment requirement. If the SRSRC that we found is clobbering with
|
|
// the scratch wave offset, which may be in a fixed SGPR or a free SGPR
|
|
// chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
|
|
// wave offset to a free SGPR.
|
|
Register ScratchWaveOffsetReg;
|
|
if (PreloadedScratchWaveOffsetReg &&
|
|
TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
|
|
ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
|
|
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
|
|
AllSGPRs = AllSGPRs.slice(
|
|
std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
|
|
Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
|
|
for (MCPhysReg Reg : AllSGPRs) {
|
|
if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
|
|
!TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
|
|
ScratchWaveOffsetReg = Reg;
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
|
|
.addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
|
|
break;
|
|
}
|
|
}
|
|
|
|
// FIXME: We can spill incoming arguments and restore at the end of the
|
|
// prolog.
|
|
if (!ScratchWaveOffsetReg)
|
|
report_fatal_error(
|
|
"could not find temporary scratch offset register in prolog");
|
|
} else {
|
|
ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
|
|
}
|
|
assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
|
|
|
|
unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST);
|
|
if (!mayReserveScratchForCWSR(MF)) {
|
|
if (hasFP(MF)) {
|
|
Register FPReg = MFI->getFrameOffsetReg();
|
|
assert(FPReg != AMDGPU::FP_REG);
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
|
|
}
|
|
|
|
if (requiresStackPointerReference(MF)) {
|
|
Register SPReg = MFI->getStackPtrOffsetReg();
|
|
assert(SPReg != AMDGPU::SP_REG);
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
|
|
}
|
|
} else {
|
|
// We need to check if we're on a compute queue - if we are, then the CWSR
|
|
// trap handler may need to store some VGPRs on the stack. The first VGPR
|
|
// block is saved separately, so we only need to allocate space for any
|
|
// additional VGPR blocks used. For now, we will make sure there's enough
|
|
// room for the theoretical maximum number of VGPRs that can be allocated.
|
|
// FIXME: Figure out if the shader uses fewer VGPRs in practice.
|
|
assert(hasFP(MF));
|
|
Register FPReg = MFI->getFrameOffsetReg();
|
|
assert(FPReg != AMDGPU::FP_REG);
|
|
unsigned VGPRSize = llvm::alignTo(
|
|
(ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()) -
|
|
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST,
|
|
MFI->getDynamicVGPRBlockSize())) *
|
|
4,
|
|
FrameInfo.getMaxAlign());
|
|
MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
|
|
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
|
|
.addImm(AMDGPU::Hwreg::HwregEncoding::encode(
|
|
AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
|
|
// The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
|
|
// (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
|
|
// SCC, so we need to check for 0 manually.
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg);
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize);
|
|
if (requiresStackPointerReference(MF)) {
|
|
Register SPReg = MFI->getStackPtrOffsetReg();
|
|
assert(SPReg != AMDGPU::SP_REG);
|
|
|
|
// If at least one of the constants can be inlined, then we can use
|
|
// s_cselect. Otherwise, use a mov and cmovk.
|
|
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm()) ||
|
|
AMDGPU::isInlinableLiteral32(Offset + VGPRSize,
|
|
ST.hasInv2PiInlineImm())) {
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CSELECT_B32), SPReg)
|
|
.addImm(Offset + VGPRSize)
|
|
.addImm(Offset);
|
|
} else {
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), SPReg)
|
|
.addImm(Offset + VGPRSize);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool NeedsFlatScratchInit =
|
|
MFI->getUserSGPRInfo().hasFlatScratchInit() &&
|
|
(MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
|
|
(!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
|
|
|
|
if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
|
|
PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
|
|
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
|
|
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
|
|
}
|
|
|
|
if (NeedsFlatScratchInit) {
|
|
emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
|
|
}
|
|
|
|
if (ScratchRsrcReg) {
|
|
emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
|
|
PreloadedScratchRsrcReg,
|
|
ScratchRsrcReg, ScratchWaveOffsetReg);
|
|
}
|
|
}
|
|
|
|
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
|
|
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
|
|
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
|
|
const DebugLoc &DL, Register PreloadedScratchRsrcReg,
|
|
Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
|
|
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
const SIInstrInfo *TII = ST.getInstrInfo();
|
|
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
|
|
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
|
const Function &Fn = MF.getFunction();
|
|
|
|
if (ST.isAmdPalOS()) {
|
|
// The pointer to the GIT is formed from the offset passed in and either
|
|
// the amdgpu-git-ptr-high function attribute or the top part of the PC
|
|
Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
|
|
Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
|
|
|
|
buildGitPtr(MBB, I, DL, TII, Rsrc01);
|
|
|
|
// We now have the GIT ptr - now get the scratch descriptor from the entry
|
|
// at offset 0 (or offset 16 for a compute shader).
|
|
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
|
|
const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
|
|
auto *MMO = MF.getMachineMemOperand(
|
|
PtrInfo,
|
|
MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
|
|
MachineMemOperand::MODereferenceable,
|
|
16, Align(4));
|
|
unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
|
|
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
|
|
unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
|
|
BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
|
|
.addReg(Rsrc01)
|
|
.addImm(EncodedOffset) // offset
|
|
.addImm(0) // cpol
|
|
.addReg(ScratchRsrcReg, RegState::ImplicitDefine)
|
|
.addMemOperand(MMO);
|
|
|
|
// The driver will always set the SRD for wave 64 (bits 118:117 of
|
|
// descriptor / bits 22:21 of third sub-reg will be 0b11)
|
|
// If the shader is actually wave32 we have to modify the const_index_stride
|
|
// field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
|
|
// reason the driver does this is that there can be cases where it presents
|
|
// 2 shaders with different wave size (e.g. VsFs).
|
|
// TODO: convert to using SCRATCH instructions or multiple SRD buffers
|
|
if (ST.isWave32()) {
|
|
const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
|
|
BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
|
|
.addImm(21)
|
|
.addReg(Rsrc03);
|
|
}
|
|
} else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
|
|
assert(!ST.isAmdHsaOrMesa(Fn));
|
|
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
|
|
|
|
Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
|
|
Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
|
|
|
|
// Use relocations to get the pointer, and setup the other bits manually.
|
|
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
|
|
|
|
if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
|
|
Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
|
|
|
|
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
|
|
const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
|
|
|
|
BuildMI(MBB, I, DL, Mov64, Rsrc01)
|
|
.addReg(MFI->getImplicitBufferPtrUserSGPR())
|
|
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
|
|
} else {
|
|
const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
|
|
|
|
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
|
|
auto *MMO = MF.getMachineMemOperand(
|
|
PtrInfo,
|
|
MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
|
|
MachineMemOperand::MODereferenceable,
|
|
8, Align(4));
|
|
BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
|
|
.addReg(MFI->getImplicitBufferPtrUserSGPR())
|
|
.addImm(0) // offset
|
|
.addImm(0) // cpol
|
|
.addMemOperand(MMO)
|
|
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
|
|
|
|
MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
|
|
MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
|
|
}
|
|
} else {
|
|
Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
|
|
Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
|
|
|
|
BuildMI(MBB, I, DL, SMovB32, Rsrc0)
|
|
.addExternalSymbol("SCRATCH_RSRC_DWORD0")
|
|
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
|
|
|
|
BuildMI(MBB, I, DL, SMovB32, Rsrc1)
|
|
.addExternalSymbol("SCRATCH_RSRC_DWORD1")
|
|
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
|
|
}
|
|
|
|
BuildMI(MBB, I, DL, SMovB32, Rsrc2)
|
|
.addImm(Lo_32(Rsrc23))
|
|
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
|
|
|
|
BuildMI(MBB, I, DL, SMovB32, Rsrc3)
|
|
.addImm(Hi_32(Rsrc23))
|
|
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
|
|
} else if (ST.isAmdHsaOrMesa(Fn)) {
|
|
assert(PreloadedScratchRsrcReg);
|
|
|
|
if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
|
|
.addReg(PreloadedScratchRsrcReg, RegState::Kill);
|
|
}
|
|
}
|
|
|
|
// Add the scratch wave offset into the scratch RSRC.
|
|
//
|
|
// We only want to update the first 48 bits, which is the base address
|
|
// pointer, without touching the adjacent 16 bits of flags. We know this add
|
|
// cannot carry-out from bit 47, otherwise the scratch allocation would be
|
|
// impossible to fit in the 48-bit global address space.
|
|
//
|
|
// TODO: Evaluate if it is better to just construct an SRD using the flat
|
|
// scratch init and some constants rather than update the one we are passed.
|
|
Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
|
|
Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
|
|
|
|
// We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
|
|
// the kernel body via inreg arguments.
|
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
|
|
.addReg(ScratchRsrcSub0)
|
|
.addReg(ScratchWaveOffsetReg)
|
|
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
|
|
auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
|
|
.addReg(ScratchRsrcSub1)
|
|
.addImm(0)
|
|
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
|
|
Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
|
|
}
|
|
|
|
bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
|
|
switch (ID) {
|
|
case TargetStackID::Default:
|
|
case TargetStackID::NoAlloc:
|
|
case TargetStackID::SGPRSpill:
|
|
return true;
|
|
case TargetStackID::ScalableVector:
|
|
case TargetStackID::WasmLocal:
|
|
return false;
|
|
}
|
|
llvm_unreachable("Invalid TargetStackID::Value");
|
|
}
|
|
|
|
// Activate only the inactive lanes when \p EnableInactiveLanes is true.
|
|
// Otherwise, activate all lanes. It returns the saved exec.
|
|
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
|
|
MachineFunction &MF,
|
|
MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI,
|
|
const DebugLoc &DL, bool IsProlog,
|
|
bool EnableInactiveLanes) {
|
|
Register ScratchExecCopy;
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
const SIInstrInfo *TII = ST.getInstrInfo();
|
|
const SIRegisterInfo &TRI = TII->getRegisterInfo();
|
|
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
|
|
|
|
initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
|
|
|
|
if (FuncInfo->isWholeWaveFunction()) {
|
|
// Whole wave functions already have a copy of the original EXEC mask that
|
|
// we can use.
|
|
assert(IsProlog && "Epilog should look at return, not setup");
|
|
ScratchExecCopy =
|
|
TII->getWholeWaveFunctionSetup(MF)->getOperand(0).getReg();
|
|
assert(ScratchExecCopy && "Couldn't find copy of EXEC");
|
|
} else {
|
|
ScratchExecCopy = findScratchNonCalleeSaveRegister(
|
|
MRI, LiveUnits, *TRI.getWaveMaskRegClass());
|
|
}
|
|
|
|
if (!ScratchExecCopy)
|
|
report_fatal_error("failed to find free scratch register");
|
|
|
|
LiveUnits.addReg(ScratchExecCopy);
|
|
|
|
const unsigned SaveExecOpc =
|
|
ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
|
|
: AMDGPU::S_OR_SAVEEXEC_B32)
|
|
: (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
|
|
: AMDGPU::S_OR_SAVEEXEC_B64);
|
|
auto SaveExec =
|
|
BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
|
|
SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
|
|
|
|
return ScratchExecCopy;
|
|
}
|
|
|
|
void SIFrameLowering::emitCSRSpillStores(
|
|
MachineFunction &MF, MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
|
|
Register FrameReg, Register FramePtrRegScratchCopy) const {
|
|
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
const SIInstrInfo *TII = ST.getInstrInfo();
|
|
const SIRegisterInfo &TRI = TII->getRegisterInfo();
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
|
|
// Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
|
|
// registers. However, save all lanes of callee-saved VGPRs. Due to this, we
|
|
// might end up flipping the EXEC bits twice.
|
|
Register ScratchExecCopy;
|
|
SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
|
|
FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
|
|
if (!WWMScratchRegs.empty())
|
|
ScratchExecCopy =
|
|
buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
|
|
/*IsProlog*/ true, /*EnableInactiveLanes*/ true);
|
|
|
|
auto StoreWWMRegisters =
|
|
[&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
|
|
for (const auto &Reg : WWMRegs) {
|
|
Register VGPR = Reg.first;
|
|
int FI = Reg.second;
|
|
buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
|
|
VGPR, FI, FrameReg);
|
|
}
|
|
};
|
|
|
|
for (const Register Reg : make_first_range(WWMScratchRegs)) {
|
|
if (!MRI.isReserved(Reg)) {
|
|
MRI.addLiveIn(Reg);
|
|
MBB.addLiveIn(Reg);
|
|
}
|
|
}
|
|
StoreWWMRegisters(WWMScratchRegs);
|
|
|
|
auto EnableAllLanes = [&]() {
|
|
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
|
|
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
|
|
};
|
|
|
|
if (!WWMCalleeSavedRegs.empty()) {
|
|
if (ScratchExecCopy) {
|
|
EnableAllLanes();
|
|
} else {
|
|
ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
|
|
/*IsProlog*/ true,
|
|
/*EnableInactiveLanes*/ false);
|
|
}
|
|
}
|
|
|
|
StoreWWMRegisters(WWMCalleeSavedRegs);
|
|
if (FuncInfo->isWholeWaveFunction()) {
|
|
// SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose, so we can remove
|
|
// it now. If we have already saved some WWM CSR registers, then the EXEC is
|
|
// already -1 and we don't need to do anything else. Otherwise, set EXEC to
|
|
// -1 here.
|
|
if (!ScratchExecCopy)
|
|
buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
|
|
/*EnableInactiveLanes*/ true);
|
|
else if (WWMCalleeSavedRegs.empty())
|
|
EnableAllLanes();
|
|
TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
|
|
} else if (ScratchExecCopy) {
|
|
// FIXME: Split block and make terminator.
|
|
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
|
|
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
|
|
.addReg(ScratchExecCopy, RegState::Kill);
|
|
LiveUnits.addReg(ScratchExecCopy);
|
|
}
|
|
|
|
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
|
|
|
|
for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
|
|
// Special handle FP spill:
|
|
// Skip if FP is saved to a scratch SGPR, the save has already been emitted.
|
|
// Otherwise, FP has been moved to a temporary register and spill it
|
|
// instead.
|
|
Register Reg =
|
|
Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
|
|
if (!Reg)
|
|
continue;
|
|
|
|
PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
|
|
LiveUnits, FrameReg);
|
|
SB.save();
|
|
}
|
|
|
|
// If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
|
|
// such scratch registers live throughout the function.
|
|
SmallVector<Register, 1> ScratchSGPRs;
|
|
FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
|
|
if (!ScratchSGPRs.empty()) {
|
|
for (MachineBasicBlock &MBB : MF) {
|
|
for (MCPhysReg Reg : ScratchSGPRs)
|
|
MBB.addLiveIn(Reg);
|
|
|
|
MBB.sortUniqueLiveIns();
|
|
}
|
|
if (!LiveUnits.empty()) {
|
|
for (MCPhysReg Reg : ScratchSGPRs)
|
|
LiveUnits.addReg(Reg);
|
|
}
|
|
}
|
|
}
|
|
|
|
void SIFrameLowering::emitCSRSpillRestores(
|
|
MachineFunction &MF, MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
|
|
Register FrameReg, Register FramePtrRegScratchCopy) const {
|
|
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
const SIInstrInfo *TII = ST.getInstrInfo();
|
|
const SIRegisterInfo &TRI = TII->getRegisterInfo();
|
|
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
|
|
|
|
for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
|
|
// Special handle FP restore:
|
|
// Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore
|
|
// the FP value to a temporary register. The frame pointer should be
|
|
// overwritten only at the end when all other spills are restored from
|
|
// current frame.
|
|
Register Reg =
|
|
Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
|
|
if (!Reg)
|
|
continue;
|
|
|
|
PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
|
|
LiveUnits, FrameReg);
|
|
SB.restore();
|
|
}
|
|
|
|
// Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
|
|
// scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
|
|
// this, we might end up flipping the EXEC bits twice.
|
|
Register ScratchExecCopy;
|
|
SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
|
|
FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
|
|
auto RestoreWWMRegisters =
|
|
[&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
|
|
for (const auto &Reg : WWMRegs) {
|
|
Register VGPR = Reg.first;
|
|
int FI = Reg.second;
|
|
buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
|
|
VGPR, FI, FrameReg);
|
|
}
|
|
};
|
|
|
|
if (FuncInfo->isWholeWaveFunction()) {
|
|
// For whole wave functions, the EXEC is already -1 at this point.
|
|
// Therefore, we can restore the CSR WWM registers right away.
|
|
RestoreWWMRegisters(WWMCalleeSavedRegs);
|
|
|
|
// The original EXEC is the first operand of the return instruction.
|
|
const MachineInstr &Return = MBB.instr_back();
|
|
assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN &&
|
|
"Unexpected return inst");
|
|
Register OrigExec = Return.getOperand(0).getReg();
|
|
|
|
if (!WWMScratchRegs.empty()) {
|
|
unsigned XorOpc = ST.isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
|
|
BuildMI(MBB, MBBI, DL, TII->get(XorOpc), TRI.getExec())
|
|
.addReg(OrigExec)
|
|
.addImm(-1);
|
|
RestoreWWMRegisters(WWMScratchRegs);
|
|
}
|
|
|
|
// Restore original EXEC.
|
|
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
|
|
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
|
|
return;
|
|
}
|
|
|
|
if (!WWMScratchRegs.empty()) {
|
|
ScratchExecCopy =
|
|
buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
|
|
/*IsProlog=*/false, /*EnableInactiveLanes=*/true);
|
|
}
|
|
RestoreWWMRegisters(WWMScratchRegs);
|
|
if (!WWMCalleeSavedRegs.empty()) {
|
|
if (ScratchExecCopy) {
|
|
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
|
|
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
|
|
} else {
|
|
ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
|
|
/*IsProlog*/ false,
|
|
/*EnableInactiveLanes*/ false);
|
|
}
|
|
}
|
|
|
|
RestoreWWMRegisters(WWMCalleeSavedRegs);
|
|
if (ScratchExecCopy) {
|
|
// FIXME: Split block and make terminator.
|
|
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
|
|
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
|
|
.addReg(ScratchExecCopy, RegState::Kill);
|
|
}
|
|
}
|
|
|
|
void SIFrameLowering::emitPrologue(MachineFunction &MF,
|
|
MachineBasicBlock &MBB) const {
|
|
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
|
|
if (FuncInfo->isEntryFunction()) {
|
|
emitEntryFunctionPrologue(MF, MBB);
|
|
return;
|
|
}
|
|
|
|
MachineFrameInfo &MFI = MF.getFrameInfo();
|
|
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
|
const SIInstrInfo *TII = ST.getInstrInfo();
|
|
const SIRegisterInfo &TRI = TII->getRegisterInfo();
|
|
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
|
|
Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
|
|
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
|
|
Register BasePtrReg =
|
|
TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
|
|
LiveRegUnits LiveUnits;
|
|
|
|
MachineBasicBlock::iterator MBBI = MBB.begin();
|
|
// DebugLoc must be unknown since the first instruction with DebugLoc is used
|
|
// to determine the end of the prologue.
|
|
DebugLoc DL;
|
|
|
|
if (FuncInfo->isChainFunction()) {
|
|
// Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
|
|
// are free to set one up if they need it.
|
|
bool UseSP = requiresStackPointerReference(MF);
|
|
if (UseSP) {
|
|
assert(StackPtrReg != AMDGPU::SP_REG);
|
|
|
|
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
|
|
.addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
|
|
}
|
|
}
|
|
|
|
bool HasFP = false;
|
|
bool HasBP = false;
|
|
uint32_t NumBytes = MFI.getStackSize();
|
|
uint32_t RoundedSize = NumBytes;
|
|
|
|
if (TRI.hasStackRealignment(MF))
|
|
HasFP = true;
|
|
|
|
Register FramePtrRegScratchCopy;
|
|
if (!HasFP && !hasFP(MF)) {
|
|
// Emit the CSR spill stores with SP base register.
|
|
emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
|
|
FuncInfo->isChainFunction() ? Register() : StackPtrReg,
|
|
FramePtrRegScratchCopy);
|
|
} else {
|
|
// CSR spill stores will use FP as base register.
|
|
Register SGPRForFPSaveRestoreCopy =
|
|
FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
|
|
|
|
initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
|
|
if (SGPRForFPSaveRestoreCopy) {
|
|
// Copy FP to the scratch register now and emit the CFI entry. It avoids
|
|
// the extra FP copy needed in the other two cases when FP is spilled to
|
|
// memory or to a VGPR lane.
|
|
PrologEpilogSGPRSpillBuilder SB(
|
|
FramePtrReg,
|
|
FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
|
|
DL, TII, TRI, LiveUnits, FramePtrReg);
|
|
SB.save();
|
|
LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
|
|
} else {
|
|
// Copy FP into a new scratch register so that its previous value can be
|
|
// spilled after setting up the new frame.
|
|
FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
|
|
MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
|
|
if (!FramePtrRegScratchCopy)
|
|
report_fatal_error("failed to find free scratch register");
|
|
|
|
LiveUnits.addReg(FramePtrRegScratchCopy);
|
|
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
|
|
.addReg(FramePtrReg);
|
|
}
|
|
}
|
|
|
|
if (HasFP) {
|
|
const unsigned Alignment = MFI.getMaxAlign().value();
|
|
|
|
RoundedSize += Alignment;
|
|
if (LiveUnits.empty()) {
|
|
LiveUnits.init(TRI);
|
|
LiveUnits.addLiveIns(MBB);
|
|
}
|
|
|
|
// s_add_i32 s33, s32, NumBytes
|
|
// s_and_b32 s33, s33, 0b111...0000
|
|
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
|
|
.addReg(StackPtrReg)
|
|
.addImm((Alignment - 1) * getScratchScaleFactor(ST))
|
|
.setMIFlag(MachineInstr::FrameSetup);
|
|
auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
|
|
.addReg(FramePtrReg, RegState::Kill)
|
|
.addImm(-Alignment * getScratchScaleFactor(ST))
|
|
.setMIFlag(MachineInstr::FrameSetup);
|
|
And->getOperand(3).setIsDead(); // Mark SCC as dead.
|
|
FuncInfo->setIsStackRealigned(true);
|
|
} else if ((HasFP = hasFP(MF))) {
|
|
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
|
|
.addReg(StackPtrReg)
|
|
.setMIFlag(MachineInstr::FrameSetup);
|
|
}
|
|
|
|
// If FP is used, emit the CSR spills with FP base register.
|
|
if (HasFP) {
|
|
emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
|
|
FramePtrRegScratchCopy);
|
|
if (FramePtrRegScratchCopy)
|
|
LiveUnits.removeReg(FramePtrRegScratchCopy);
|
|
}
|
|
|
|
// If we need a base pointer, set it up here. It's whatever the value of
|
|
// the stack pointer is at this point. Any variable size objects will be
|
|
// allocated after this, so we can still use the base pointer to reference
|
|
// the incoming arguments.
|
|
if ((HasBP = TRI.hasBasePointer(MF))) {
|
|
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
|
|
.addReg(StackPtrReg)
|
|
.setMIFlag(MachineInstr::FrameSetup);
|
|
}
|
|
|
|
if (HasFP && RoundedSize != 0) {
|
|
auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
|
|
.addReg(StackPtrReg)
|
|
.addImm(RoundedSize * getScratchScaleFactor(ST))
|
|
.setMIFlag(MachineInstr::FrameSetup);
|
|
Add->getOperand(3).setIsDead(); // Mark SCC as dead.
|
|
}
|
|
|
|
bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
|
|
(void)FPSaved;
|
|
assert((!HasFP || FPSaved) &&
|
|
"Needed to save FP but didn't save it anywhere");
|
|
|
|
// If we allow spilling to AGPRs we may have saved FP but then spill
|
|
// everything into AGPRs instead of the stack.
|
|
assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
|
|
"Saved FP but didn't need it");
|
|
|
|
bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
|
|
(void)BPSaved;
|
|
assert((!HasBP || BPSaved) &&
|
|
"Needed to save BP but didn't save it anywhere");
|
|
|
|
assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
|
|
}
|
|
|
|
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LiveRegUnits LiveUnits;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

  if (RoundedSize != 0) {
    if (TRI.hasBasePointer(MF)) {
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
          .addReg(TRI.getBaseRegister())
          .setMIFlag(MachineInstr::FrameDestroy);
    } else if (hasFP(MF)) {
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
          .addReg(FramePtrReg)
          .setMIFlag(MachineInstr::FrameDestroy);
    }
  }

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as base register. If
    // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
    // into a new scratch register and copy to FP later when other registers
    // are restored from the current stack frame.
    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                         FramePtrRegScratchCopy);
  }

  if (FPSaved) {
    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
            .addReg(SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      MIB.setMIFlag(MachineInstr::FrameDestroy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits,
                         FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}

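// Debug-only check used below: after SILowerSGPRSpills has run, every SGPR
// spill stack object should either be dead or be one of the prolog/epilog
// SGPR spills created by frame lowering itself.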
#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
      return false;
    }
  }

  return true;
}
#endif

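// Frame index references are reported relative to whatever SIRegisterInfo
// selects as the frame register for this function, using the raw object
// offset recorded in the frame info.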
StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                                    int FI,
                                                    Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
}

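// Runs after register allocation, before the frame is finalized: fold VGPR
// spills into AGPRs where possible, release dead SGPR spill slots, and create
// emergency scavenging slots for frames that still spill to memory.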
void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
                               && EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
              TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            assert(RS != nullptr);
            RS->enterBasicBlockEnd(MBB);
            RS->backward(std::next(MI.getIterator()));
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack slot.
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead update it with the
        // correct register value. But not sure the register value alone is
        // enough to lower the DIExpression. It should be worked out later.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue()) {
            uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0;
            if (MI.getOperand(StackOperandIdx).isFI() &&
                !MFI.isFixedObjectIndex(
                    MI.getOperand(StackOperandIdx).getIndex()) &&
                SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) {
              MI.getOperand(StackOperandIdx)
                  .ChangeToRegister(Register(), false /*isDef*/);
            }
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateSpillStackObject(4, Align(4)));
    }
  }
}

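// Late adjustment, just before frame indices are replaced: registers that
// were conservatively reserved before RA (the gfx908 VGPR used for AGPR
// copies and the SGPR pair reserved for long branches) are shifted down to
// lower registers that RA left unused.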
void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we had initially reserved the highest available VGPR for AGPR
    // copy. Now that we are done with RA, check if there exists an unused VGPR
    // which is lower than the one reserved before RA. If one exists, use it
    // for the AGPR copy instead of the earlier reservation.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Reserve this newly identified VGPR for the AGPR copy. Reserved
      // registers should already be frozen at this point, so we can avoid
      // calling MRI.freezeReservedRegs and just use MRI.reserveReg.
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(UnusedLowVGPR, TRI);
    }
  }
  // We initially reserved the highest available SGPR pair for long branches;
  // now, after RA, we shift down to a lower unused one if one exists.
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null then we didn't find a long branch
  // and never reserved a register to begin with, so there is nothing to
  // shift down. Then if UnusedLowSGPR is null, there is no available lower
  // register to use, so just keep the original one we set.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(UnusedLowSGPR, TRI);
  }
}

// The special SGPR spills, like the ones needed for FP, BP or any reserved
// registers, are delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LiveRegUnits LiveUnits;
  LiveUnits.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveUnits.addReg(CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();

  Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
  if (NeedExecCopyReservedReg ||
      (ReservedRegForExecCopy &&
       MRI.isPhysRegUsed(ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
    MRI.reserveReg(ReservedRegForExecCopy, TRI);
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
    if (UnusedScratchReg) {
      // If we found an unused scratch SGPR, use the register itself for the
      // EXEC copy; there is no need for any spill in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      MRI.replaceRegWith(ReservedRegForExecCopy, UnusedScratchReg);
      LiveUnits.addReg(UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedRegForExecCopy, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  } else if (ReservedRegForExecCopy) {
    // Reset it at this point. No whole-wave copies or spills were encountered.
    MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
  }

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
  }
}

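// determineCalleeSaves decides which VGPRs need CSR treatment, sets up the
// WWM spill stack objects, and delegates the FP/BP/EXEC-copy SGPR handling to
// determinePrologEpilogSGPRSaves above.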
// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If this is a function with the amdgpu_cs_chain[_preserve] calling
  // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
  // we don't need to save and restore anything.
  if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
    return;

  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool NeedExecCopyReservedReg = false;

  MachineInstr *ReturnMI = nullptr;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // TODO: Walking through all MBBs here would be a bad heuristic. Better
      // handle them elsewhere.
      if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
        NeedExecCopyReservedReg = true;
      else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
               MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
               MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
               (MFI->isChainFunction() &&
                TII->isChainCallOpcode(MI.getOpcode()))) {
        // We expect all returns to be the same size.
        assert(!ReturnMI ||
               (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
                count_if(ReturnMI->operands(),
                         [](auto Op) { return Op.isReg(); })));
        ReturnMI = &MI;
      }
    }
  }

  SmallVector<Register> SortedWWMVGPRs;
  for (Register Reg : MFI->getWWMReservedRegs()) {
    // The shift-back is needed only for the VGPRs used for SGPR spills and
    // they are of 32-bit size. SIPreAllocateWWMRegs pass can add tuples into
    // WWM reserved registers.
    const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
    if (TRI->getRegSizeInBits(*RC) != 32)
      continue;
    SortedWWMVGPRs.push_back(Reg);
  }

  sort(SortedWWMVGPRs, std::greater<Register>());
  MFI->shiftWwmVGPRsToLowestRange(MF, SortedWWMVGPRs, SavedVGPRs);

  if (MFI->isEntryFunction())
    return;

  if (MFI->isWholeWaveFunction()) {
    // In practice, all the VGPRs are WWM registers, and we will need to save
    // at least their inactive lanes. Add them to WWMReservedRegs.
    assert(!NeedExecCopyReservedReg &&
           "Whole wave functions can use the reg mapped for their i1 argument");

    // FIXME: Be more efficient!
    for (MCRegister Reg : AMDGPU::VGPR_32RegClass)
      if (MF.getRegInfo().isPhysRegModified(Reg)) {
        MFI->reserveWWMRegister(Reg);
        MF.begin()->addLiveIn(Reg);
      }
    MF.begin()->sortUniqueLiveIns();
  }

  // Remove any VGPRs used in the return value because these do not need to be
  // saved. This prevents CSR restore from clobbering return VGPRs.
  if (ReturnMI) {
    for (auto &Op : ReturnMI->operands()) {
      if (Op.isReg())
        SavedVGPRs.reset(Op.getReg());
    }
  }

  // Create the stack objects for WWM registers now.
  for (Register Reg : MFI->getWWMReservedRegs()) {
    const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
    MFI->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
                          TRI->getSpillAlign(*RC));
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // gfx908 has no AGPR loads and stores, so spilling them would also require
  // a temporary VGPR.
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);

  // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);
}

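// SGPR-only variant of determineCalleeSaves (used when only the SGPR callee
// saves are of interest, e.g. from SILowerSGPRSpills): keep the default CSR
// set minus vector registers and the specially managed SP/FP, and force a
// save of the return address whenever it can be clobbered.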
void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());

  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills, or spills of the caller
  // save VGPR reserved for SGPR spills, as we now always create a stack entry
  // for it even if we don't have any stack objects already, since we require
  // an FP if there is a call and stack. We will allocate a VGPR for SGPR
  // spills if there are any SGPR spills, whether they are CSR spills or
  // otherwise.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(MFI->getFrameOffsetReg());

  // Return address use with return instruction is hidden through the SI_RETURN
  // pseudo. Given that, and since IPRA computes actual register usage and does
  // not use the CSR list, the clobbering of the return address by function
  // calls (D117243) or otherwise (D120922) is not seen by IPRA's register
  // usage collection. This will ensure save/restore of the return address
  // happens in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
  }
}

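// For subtargets with VGPR block load/store instructions, group the
// callee-saved VGPRs into 32-register blocks: each block gets a single stack
// object plus a bit mask recording which registers of the block actually need
// to be saved.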
static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
                                       const GCNSubtarget &ST,
                                       std::vector<CalleeSavedInfo> &CSI,
                                       unsigned &MinCSFrameIndex,
                                       unsigned &MaxCSFrameIndex) {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  assert(
      llvm::is_sorted(CSI,
                      [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
                        return A.getReg() < B.getReg();
                      }) &&
      "Callee saved registers not sorted");

  auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
    return !CSI.isSpilledToReg() &&
           TRI->getPhysRegBaseClass(CSI.getReg()) == &AMDGPU::VGPR_32RegClass &&
           !FuncInfo->isWWMReservedRegister(CSI.getReg());
  };

  auto CSEnd = CSI.end();
  for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
    Register Reg = CSIt->getReg();
    if (!CanUseBlockOps(*CSIt))
      continue;

    // Find all the regs that will fit in a 32-bit mask starting at the current
    // reg and build said mask. It should have 1 for every register that's
    // included, with the current register as the least significant bit.
    uint32_t Mask = 1;
    CSEnd = std::remove_if(
        CSIt + 1, CSEnd, [&](const CalleeSavedInfo &CSI) -> bool {
          if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) {
            Mask |= 1 << (CSI.getReg() - Reg);
            return true;
          } else {
            return false;
          }
        });

    const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF);
    Register RegBlock =
        TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, BlockRegClass);
    if (!RegBlock) {
      // We couldn't find a super register for the block. This can happen if
      // the register we started with is too high (e.g. v232 if the maximum is
      // v255). We therefore try to get the last register block and figure out
      // the mask from there.
      Register LastBlockStart =
          AMDGPU::VGPR0 + alignDown(Reg - AMDGPU::VGPR0, 32);
      RegBlock =
          TRI->getMatchingSuperReg(LastBlockStart, AMDGPU::sub0, BlockRegClass);
      assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) &&
             "Couldn't find super register");
      int RegDelta = Reg - LastBlockStart;
      assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
             "Bad shift amount");
      Mask <<= RegDelta;
    }

    FuncInfo->setMaskForVGPRBlockOps(RegBlock, Mask);

    // The stack objects can be a bit smaller than the register block if we
    // know some of the high bits of Mask are 0. This may happen often with
    // calling conventions where the caller and callee-saved VGPRs are
    // interleaved at a small boundary (e.g. 8 or 16).
    int UnusedBits = llvm::countl_zero(Mask);
    unsigned BlockSize = TRI->getSpillSize(*BlockRegClass) - UnusedBits * 4;
    int FrameIdx =
        MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass),
                              /*isSpillSlot=*/true);
    if ((unsigned)FrameIdx < MinCSFrameIndex)
      MinCSFrameIndex = FrameIdx;
    if ((unsigned)FrameIdx > MaxCSFrameIndex)
      MaxCSFrameIndex = FrameIdx;

    CSIt->setFrameIdx(FrameIdx);
    CSIt->setReg(RegBlock);
  }
  CSI.erase(CSEnd, CSI.end());
}

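// The five-operand overload rewrites the CSI list in terms of register blocks
// first (when block ops are available) and then runs the common path below.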
bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
    unsigned &MaxCSFrameIndex) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();

  if (UseVGPRBlocks)
    assignSlotsUsingVGPRBlocks(MF, ST, CSI, MinCSFrameIndex, MaxCSFrameIndex);

  return assignCalleeSavedSpillSlots(MF, TRI, CSI) || UseVGPRBlocks;
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  Register SGPRForBPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
  if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
    return false;

  unsigned NumModifiedRegs = 0;

  if (SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg.asMCReg() && SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(SGPRForFPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    } else if (CS.getReg() == BasePtrReg.asMCReg() &&
               SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    }
  }

  return false;
}

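// Returns true when the estimated frame is large enough that an emergency
// scavenging slot at the end of the frame might not be reachable with a
// MUBUF or flat-scratch immediate offset, in which case the slots should be
// placed near the incoming SP instead.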
bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
    const MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  uint64_t EstStackSize = MFI.estimateStackSize(MF);
  uint64_t MaxOffset = EstStackSize - 1;

  // We need the emergency stack slots to be allocated in range of the
  // MUBUF/flat scratch immediate offset from the base register, so assign
  // these first at the incoming SP position.
  //
  // TODO: We could try sorting the objects to find a hole in the first bytes
  // rather than allocating as close to possible. This could save a lot of
  // space on frames with alignment requirements.
  if (ST.enableFlatScratch()) {
    if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                               SIInstrFlags::FlatScratch))
      return false;
  } else {
    if (TII->isLegalMUBUFImmOffset(MaxOffset))
      return false;
  }

  return true;
}

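// Custom CSR save path: when the subtarget supports VGPR block stores, emit
// SI_BLOCK_SPILL_V1024_SAVE for the blocks prepared by
// assignSlotsUsingVGPRBlocks and fall back to the generic per-register save
// otherwise. Returning false lets PEI insert the default spill code.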
bool SIFrameLowering::spillCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const TargetRegisterClass *BlockRegClass =
      static_cast<const SIRegisterInfo *>(TRI)->getRegClassForBlockOp(*MF);
  for (const CalleeSavedInfo &CS : CSI) {
    Register Reg = CS.getReg();
    if (!BlockRegClass->contains(Reg) ||
        !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
      spillCalleeSavedRegister(MBB, MI, CS, TII, TRI);
      continue;
    }

    // Build a scratch block store.
    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
    int FrameIndex = CS.getFrameIdx();
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(*MF, FrameIndex);
    MachineMemOperand *MMO =
        MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                 FrameInfo.getObjectSize(FrameIndex),
                                 FrameInfo.getObjectAlign(FrameIndex));

    BuildMI(MBB, MI, MI->getDebugLoc(),
            TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
        .addReg(Reg, getKillRegState(false))
        .addFrameIndex(FrameIndex)
        .addReg(MFI->getStackPtrOffsetReg())
        .addImm(0)
        .addImm(Mask)
        .addMemOperand(MMO);

    FuncInfo->setHasSpilledVGPRs();

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness`
    // will skip it.
    MBB.addLiveIn(Reg);
  }
  MBB.sortUniqueLiveIns();

  return true;
}

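// Restore path for the block saves above: reload each block with
// SI_BLOCK_SPILL_V1024_RESTORE using the recorded mask, and let
// SIRegisterInfo add the implicit operands it requires for a block CSR load.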
bool SIFrameLowering::restoreCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;

  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *SITRI = static_cast<const SIRegisterInfo *>(TRI);
  const TargetRegisterClass *BlockRegClass = SITRI->getRegClassForBlockOp(*MF);
  for (const CalleeSavedInfo &CS : reverse(CSI)) {
    Register Reg = CS.getReg();
    if (!BlockRegClass->contains(Reg) ||
        !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
      restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI);
      continue;
    }

    // Build a scratch block load.
    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
    int FrameIndex = CS.getFrameIdx();
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(*MF, FrameIndex);
    MachineMemOperand *MMO = MF->getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
        MFI.getObjectAlign(FrameIndex));

    auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
                       TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), Reg)
                   .addFrameIndex(FrameIndex)
                   .addReg(FuncInfo->getStackPtrOffsetReg())
                   .addImm(0)
                   .addImm(Mask)
                   .addMemOperand(MMO);
    SITRI->addImplicitUsesForBlockCSRLoad(MIB, Reg);

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness`
    // will skip it.
    MBB.addLiveIn(Reg);
  }

  MBB.sortUniqueLiveIns();
  return true;
}

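// Lower the call-frame setup/destroy pseudos. Without a reserved call frame,
// SP is bumped by the (scaled) call frame size before the call and moved back
// afterwards; otherwise the pseudos are simply erased.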
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF,
    MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

    Amount *= getScratchScaleFactor(ST);
    if (IsDestroy)
      Amount = -Amount;
    auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
                   .addReg(SPReg)
                   .addImm(Amount);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

/// Returns true if the frame will require a reference to the stack pointer.
///
/// This is the set of conditions common to setting up the stack pointer in a
/// kernel, and for using a frame pointer in a callable function.
///
/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
/// references SP.
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
  return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
}

// The FP for kernels is always known 0, so we never really need to set up an
// explicit register for it. However, DisableFramePointerElim will force us to
// use a register for it.
bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry & chain functions we can use an immediate offset in most cases,
  // so the presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
             MF) ||
         mayReserveScratchForCWSR(MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}

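// With dynamic VGPRs enabled, compute entry points may need scratch reserved
// for the CWSR trap handler, so hasFPImpl above treats them as requiring a
// frame even without explicit stack objects.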
bool SIFrameLowering::mayReserveScratchForCWSR(
    const MachineFunction &MF) const {
  return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
         AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) &&
         AMDGPU::isCompute(MF.getFunction().getCallingConv());
}

// This is essentially a reduced version of hasFP for entry functions. Since
// the stack pointer is known 0 on entry to kernels, we never really need an
// FP register. We may need to initialize the stack pointer depending on the
// frame properties, which logically overlaps many of the cases where an
// ordinary function would require an FP.
// Also used for chain functions. While not technically entry functions, chain
// functions may need to set up a stack pointer in some situations.
bool SIFrameLowering::requiresStackPointerReference(
    const MachineFunction &MF) const {
  // Callable functions always require a stack pointer reference.
  assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
          MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
         "only expected to call this for entry points and chain functions");

  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Entry points ordinarily don't need to initialize SP. We have to set it up
  // for callees if there are any. Also note tail calls are impossible/don't
  // make any sense for kernels.
  if (MFI.hasCalls())
    return true;

  // We still need to initialize the SP if we're doing anything weird that
  // references the SP, like variable sized stack objects.
  return frameTriviallyRequiresSP(MFI);
}