This extends the MachineSMEABIPass to handle agnostic ZA functions. This
case is currently handled like shared ZA functions, but we don't require
ZA state to be reloaded before agnostic ZA calls.
Note: This patch does not yet fully handle agnostic ZA functions that
can catch exceptions. E.g.:
```
__arm_agnostic("sme_za_state") void try_catch_agnostic_za_callee()
{
try {
agnostic_za_call();
} catch(...) {
noexcept_agnostic_za_call();
}
}
```
In this case, we won't commit a ZA save before the
`agnostic_za_call()`, even though one would be needed to restore ZA in the
catch block. This will be handled in a later patch.
Change-Id: I9cce7b42ec8b64d5442b35231b65dfaf9d149eed
855 lines
33 KiB
C++
855 lines
33 KiB
C++
//===- MachineSMEABIPass.cpp ----------------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This pass implements the SME ABI requirements for ZA state. This includes
|
|
// implementing the lazy (and agnostic) ZA state save schemes around calls.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This pass works by collecting instructions that require ZA to be in a
|
|
// specific state (e.g., "ACTIVE" or "SAVED") and inserting the necessary state
|
|
// transitions to ensure ZA is in the required state before instructions. State
|
|
// transitions represent actions such as setting up or restoring a lazy save.
|
|
// Certain points within a function may also have predefined states independent
|
|
// of any instructions, for example, a "shared_za" function is always entered
|
|
// and exited in the "ACTIVE" state.
|
|
//
|
|
// To handle ZA state across control flow, we make use of edge bundling. This
|
|
// assigns each block an "incoming" and "outgoing" edge bundle (representing
|
|
// incoming and outgoing edges). Initially, these are unique to each block;
|
|
// then, in the process of forming bundles, the outgoing bundle of a block is
|
|
// joined with the incoming bundle of all successors. The result is that each
|
|
// bundle can be assigned a single ZA state, which ensures the state required by
|
|
// all of a block's successors is the same, and that each basic block will always
|
|
// be entered with the same ZA state. This eliminates the need for splitting
|
|
// edges to insert state transitions or "phi" nodes for ZA states.
|
|
//
|
|
// See below for a simple example of edge bundling.
|
|
//
|
|
// The following shows a conditionally executed basic block (BB1):
|
|
//
|
|
// if (cond)
|
|
// BB1
|
|
// BB2
|
|
//
|
|
// Initial Bundles Joined Bundles
|
|
//
|
|
// ┌──0──┐ ┌──0──┐
|
|
// │ BB0 │ │ BB0 │
|
|
// └──1──┘ └──1──┘
|
|
// ├───────┐ ├───────┐
|
|
// ▼ │ ▼ │
|
|
// ┌──2──┐ │ ─────► ┌──1──┐ │
|
|
// │ BB1 │ ▼ │ BB1 │ ▼
|
|
// └──3──┘ ┌──4──┐ └──1──┘ ┌──1──┐
|
|
// └───►4 BB2 │ └───►1 BB2 │
|
|
// └──5──┘ └──2──┘
|
|
//
|
|
// On the left are the initial per-block bundles, and on the right are the
|
|
// joined bundles (which are the result of the EdgeBundles analysis).
|
|
|
|
#include "AArch64InstrInfo.h"
|
|
#include "AArch64MachineFunctionInfo.h"
|
|
#include "AArch64Subtarget.h"
|
|
#include "MCTargetDesc/AArch64AddressingModes.h"
|
|
#include "llvm/ADT/BitmaskEnum.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/CodeGen/EdgeBundles.h"
|
|
#include "llvm/CodeGen/LivePhysRegs.h"
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
|
#include "llvm/CodeGen/TargetRegisterInfo.h"
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "aarch64-machine-sme-abi"
|
|
|
|
namespace {
|
|
|
|
/// The states ZA can be in (or can be required to be in) at a program point.
/// The enumerators double as array indices when counting state preferences
/// per edge bundle, so they must stay dense and start at zero.
enum ZAState {
  /// No particular state is known or required (not a valid state for ZA).
  ANY = 0,

  /// ZA is in use and active (i.e. its contents are within the accumulator).
  ACTIVE,

  /// A ZA save has been set up or committed (i.e. ZA is dormant or off).
  LOCAL_SAVED,

  /// Either ZA is off, or the caller has set up a lazy save.
  CALLER_DORMANT,

  /// ZA is off.
  OFF,

  /// The number of ZA states above (not a valid state).
  NUM_ZA_STATE
};
|
|
|
|
/// A bitmask enum to record live physical registers that the "emit*" routines
|
|
/// may need to preserve. Note: This only tracks registers we may clobber.
|
|
enum LiveRegs : uint8_t {
|
|
None = 0,
|
|
NZCV = 1 << 0,
|
|
W0 = 1 << 1,
|
|
W0_HI = 1 << 2,
|
|
X0 = W0 | W0_HI,
|
|
LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ W0_HI)
|
|
};
|
|
|
|
/// Holds the virtual registers live physical registers have been saved to.
|
|
struct PhysRegSave {
|
|
LiveRegs PhysLiveRegs;
|
|
Register StatusFlags = AArch64::NoRegister;
|
|
Register X0Save = AArch64::NoRegister;
|
|
};
|
|
|
|
/// Returns true if \p State may be assigned to an edge bundle. Only the
/// ACTIVE and LOCAL_SAVED states are accepted; every other value is rejected.
static bool isLegalEdgeBundleZAState(ZAState State) {
  return State == ZAState::ACTIVE || State == ZAState::LOCAL_SAVED;
}
|
|
/// Describes the function's TPIDR2 block.
struct TPIDR2State {
  /// Frame index of the stack object backing the block (-1 until assigned).
  int FrameIndex = -1;
};
|
|
|
|
/// Returns the qualified enumerator name of \p State (e.g. "ZAState::ACTIVE").
/// Passing NUM_ZA_STATE (or any other value) is a programming error.
StringRef getZAStateString(ZAState State) {
  switch (State) {
  case ZAState::ANY:
    return "ZAState::ANY";
  case ZAState::ACTIVE:
    return "ZAState::ACTIVE";
  case ZAState::LOCAL_SAVED:
    return "ZAState::LOCAL_SAVED";
  case ZAState::CALLER_DORMANT:
    return "ZAState::CALLER_DORMANT";
  case ZAState::OFF:
    return "ZAState::OFF";
  default:
    llvm_unreachable("Unexpected ZAState");
  }
}
|
|
|
|
static bool isZAorZT0RegOp(const TargetRegisterInfo &TRI,
|
|
const MachineOperand &MO) {
|
|
if (!MO.isReg() || !MO.getReg().isPhysical())
|
|
return false;
|
|
return any_of(TRI.subregs_inclusive(MO.getReg()), [](const MCPhysReg &SR) {
|
|
return AArch64::MPR128RegClass.contains(SR) ||
|
|
AArch64::ZTRRegClass.contains(SR);
|
|
});
|
|
}
|
|
|
|
/// Returns the required ZA state needed before \p MI and an iterator pointing
|
|
/// to where any code required to change the ZA state should be inserted.
|
|
static std::pair<ZAState, MachineBasicBlock::iterator>
|
|
getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
|
|
bool ZAOffAtReturn) {
|
|
MachineBasicBlock::iterator InsertPt(MI);
|
|
|
|
if (MI.getOpcode() == AArch64::InOutZAUsePseudo)
|
|
return {ZAState::ACTIVE, std::prev(InsertPt)};
|
|
|
|
if (MI.getOpcode() == AArch64::RequiresZASavePseudo)
|
|
return {ZAState::LOCAL_SAVED, std::prev(InsertPt)};
|
|
|
|
if (MI.isReturn())
|
|
return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt};
|
|
|
|
for (auto &MO : MI.operands()) {
|
|
if (isZAorZT0RegOp(TRI, MO))
|
|
return {ZAState::ACTIVE, InsertPt};
|
|
}
|
|
|
|
return {ZAState::ANY, InsertPt};
|
|
}
|
|
|
|
struct MachineSMEABI : public MachineFunctionPass {
|
|
inline static char ID = 0;
|
|
|
|
MachineSMEABI() : MachineFunctionPass(ID) {}
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
|
|
|
StringRef getPassName() const override { return "Machine SME ABI pass"; }
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.setPreservesCFG();
|
|
AU.addRequired<EdgeBundlesWrapperLegacy>();
|
|
AU.addPreservedID(MachineLoopInfoID);
|
|
AU.addPreservedID(MachineDominatorsID);
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
}
|
|
|
|
/// Collects the needed ZA state (and live registers) before each instruction
|
|
/// within the machine function.
|
|
void collectNeededZAStates(SMEAttrs);
|
|
|
|
/// Assigns each edge bundle a ZA state based on the needed states of blocks
|
|
/// that have incoming or outgoing edges in that bundle.
|
|
void assignBundleZAStates();
|
|
|
|
/// Inserts code to handle changes between ZA states within the function.
|
|
/// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA.
|
|
void insertStateChanges(bool IsAgnosticZA);
|
|
|
|
// Emission routines for private and shared ZA functions (using lazy saves).
|
|
void emitNewZAPrologue(MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI);
|
|
void emitRestoreLazySave(MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI,
|
|
LiveRegs PhysLiveRegs);
|
|
void emitSetupLazySave(MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI);
|
|
void emitAllocateLazySaveBuffer(MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI);
|
|
void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
|
|
bool ClearTPIDR2);
|
|
|
|
// Emission routines for agnostic ZA functions.
|
|
void emitSetupFullZASave(MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI,
|
|
LiveRegs PhysLiveRegs);
|
|
void emitFullZASaveRestore(MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI,
|
|
LiveRegs PhysLiveRegs, bool IsSave);
|
|
void emitAllocateFullZASaveBuffer(MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI,
|
|
LiveRegs PhysLiveRegs);
|
|
|
|
void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
|
|
ZAState From, ZAState To, LiveRegs PhysLiveRegs,
|
|
bool IsAgnosticZA);
|
|
|
|
// Helpers for switching between lazy/full ZA save/restore routines.
|
|
void emitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
|
|
LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
|
|
if (IsAgnosticZA)
|
|
return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true);
|
|
return emitSetupLazySave(MBB, MBBI);
|
|
}
|
|
void emitZARestore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
|
|
LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
|
|
if (IsAgnosticZA)
|
|
return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false);
|
|
return emitRestoreLazySave(MBB, MBBI, PhysLiveRegs);
|
|
}
|
|
void emitAllocateZASaveBuffer(MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI,
|
|
LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
|
|
if (IsAgnosticZA)
|
|
return emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs);
|
|
return emitAllocateLazySaveBuffer(MBB, MBBI);
|
|
}
|
|
|
|
/// Save live physical registers to virtual registers.
|
|
PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI, DebugLoc DL);
|
|
/// Restore physical registers from a save of their previous values.
|
|
void restorePhyRegSave(PhysRegSave const &RegSave, MachineBasicBlock &MBB,
|
|
MachineBasicBlock::iterator MBBI, DebugLoc DL);
|
|
|
|
/// Get or create a TPIDR2 block in this function.
|
|
TPIDR2State getTPIDR2Block();
|
|
|
|
Register getAgnosticZABufferPtr();
|
|
|
|
private:
|
|
/// Contains the needed ZA state (and live registers) at an instruction.
|
|
struct InstInfo {
|
|
ZAState NeededState{ZAState::ANY};
|
|
MachineBasicBlock::iterator InsertPt;
|
|
LiveRegs PhysLiveRegs = LiveRegs::None;
|
|
};
|
|
|
|
/// Contains the needed ZA state for each instruction in a block.
|
|
/// Instructions that do not require a ZA state are not recorded.
|
|
struct BlockInfo {
|
|
ZAState FixedEntryState{ZAState::ANY};
|
|
SmallVector<InstInfo> Insts;
|
|
LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
|
|
LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
|
|
};
|
|
|
|
// All pass state that must be cleared between functions.
|
|
struct PassState {
|
|
SmallVector<BlockInfo> Blocks;
|
|
SmallVector<ZAState> BundleStates;
|
|
std::optional<TPIDR2State> TPIDR2Block;
|
|
std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
|
|
Register AgnosticZABufferPtr = AArch64::NoRegister;
|
|
LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
|
|
bool HasFullZASaveRestore = false;
|
|
} State;
|
|
|
|
MachineFunction *MF = nullptr;
|
|
EdgeBundles *Bundles = nullptr;
|
|
const AArch64Subtarget *Subtarget = nullptr;
|
|
const AArch64RegisterInfo *TRI = nullptr;
|
|
const TargetInstrInfo *TII = nullptr;
|
|
MachineRegisterInfo *MRI = nullptr;
|
|
};
|
|
|
|
/// Fills in State.Blocks: for each block, its fixed entry state (if any), the
/// instructions that need a specific ZA state, and the tracked physical
/// registers that are live at block entry, block exit and each such
/// instruction. Also records the SMEStateAllocPseudo position when present.
void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
  assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
          SMEFnAttrs.hasZAState()) &&
         "Expected function to have ZA/ZT0 state!");

  const bool HasPrivateZAInterface = SMEFnAttrs.hasPrivateZAInterface();

  State.Blocks.resize(MF->getNumBlockIDs());
  for (MachineBasicBlock &MBB : *MF) {
    BlockInfo &Info = State.Blocks[MBB.getNumber()];

    if (&MBB == &MF->front()) {
      // Function entry: private ZA functions start in CALLER_DORMANT, all
      // other functions handled here start with ZA active.
      Info.FixedEntryState =
          HasPrivateZAInterface ? ZAState::CALLER_DORMANT : ZAState::ACTIVE;
    } else if (MBB.isEHPad()) {
      // Exception handler entry.
      Info.FixedEntryState = ZAState::LOCAL_SAVED;
    }

    // Liveness is computed by walking the block bottom-up from its live-outs.
    LiveRegUnits LiveUnits(*TRI);
    LiveUnits.addLiveOuts(MBB);

    // Converts the current liveness into the LiveRegs bitmask.
    auto SnapshotLiveRegs = [&LiveUnits] {
      LiveRegs Live = LiveRegs::None;
      if (!LiveUnits.available(AArch64::NZCV))
        Live |= LiveRegs::NZCV;
      // W0 and the upper half of X0 are tracked separately as otherwise
      // things can get confused if we attempt to preserve X0 but only W0 was
      // defined.
      if (!LiveUnits.available(AArch64::W0))
        Live |= LiveRegs::W0;
      if (!LiveUnits.available(AArch64::W0_HI))
        Live |= LiveRegs::W0_HI;
      return Live;
    };

    // Default for blocks where no instruction is the first terminator.
    Info.PhysLiveRegsAtExit = SnapshotLiveRegs();

    const auto FirstTerminatorPt = MBB.getFirstTerminator();
    const auto FirstNonPhiPt = MBB.getFirstNonPHI();
    for (MachineInstr &MI : reverse(MBB)) {
      MachineBasicBlock::iterator MBBI(MI);
      LiveUnits.stepBackward(MI);
      // Registers live immediately before MI.
      LiveRegs PhysLiveRegs = SnapshotLiveRegs();

      // The SMEStateAllocPseudo marker is added to a function if the save
      // buffer was allocated in SelectionDAG. It marks the end of that
      // allocation, which is a safe point for this pass to insert any TPIDR2
      // block setup.
      if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
        State.AfterSMEProloguePt = MBBI;
        State.PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
      }

      // Note: Agnostic ZA is treated like inout_za with an alternate
      // save/restore.
      auto [NeededState, InsertPt] = getZAStateBeforeInst(
          *TRI, MI, /*ZAOffAtReturn=*/HasPrivateZAInterface);
      assert((InsertPt == MBBI ||
              InsertPt->getOpcode() == AArch64::ADJCALLSTACKDOWN) &&
             "Unexpected state change insertion point!");

      // TODO: Do something to avoid state changes where NZCV is live.
      if (MBBI == FirstTerminatorPt)
        Info.PhysLiveRegsAtExit = PhysLiveRegs;
      if (MBBI == FirstNonPhiPt)
        Info.PhysLiveRegsAtEntry = PhysLiveRegs;
      if (NeededState != ZAState::ANY)
        Info.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
    }

    // The walk above was bottom-up; put the records back in program order.
    std::reverse(Info.Insts.begin(), Info.Insts.end());
  }
}
|
|
|
|
void MachineSMEABI::assignBundleZAStates() {
|
|
State.BundleStates.resize(Bundles->getNumBundles());
|
|
for (unsigned I = 0, E = Bundles->getNumBundles(); I != E; ++I) {
|
|
LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n');
|
|
|
|
// Attempt to assign a ZA state for this bundle that minimizes state
|
|
// transitions. Edges within loops are given a higher weight as we assume
|
|
// they will be executed more than once.
|
|
// TODO: We should propagate desired incoming/outgoing states through blocks
|
|
// that have the "ANY" state first to make better global decisions.
|
|
int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
|
|
for (unsigned BlockID : Bundles->getBlocks(I)) {
|
|
LLVM_DEBUG(dbgs() << "- bb." << BlockID);
|
|
|
|
const BlockInfo &Block = State.Blocks[BlockID];
|
|
if (Block.Insts.empty()) {
|
|
LLVM_DEBUG(dbgs() << " (no state preference)\n");
|
|
continue;
|
|
}
|
|
bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I;
|
|
bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I;
|
|
|
|
ZAState DesiredIncomingState = Block.Insts.front().NeededState;
|
|
if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
|
|
EdgeStateCounts[DesiredIncomingState]++;
|
|
LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
|
|
<< getZAStateString(DesiredIncomingState));
|
|
}
|
|
ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
|
|
if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
|
|
EdgeStateCounts[DesiredOutgoingState]++;
|
|
LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
|
|
<< getZAStateString(DesiredOutgoingState));
|
|
}
|
|
LLVM_DEBUG(dbgs() << '\n');
|
|
}
|
|
|
|
ZAState BundleState =
|
|
ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);
|
|
|
|
// Force ZA to be active in bundles that don't have a preferred state.
|
|
// TODO: Something better here (to avoid extra mode switches).
|
|
if (BundleState == ZAState::ANY)
|
|
BundleState = ZAState::ACTIVE;
|
|
|
|
LLVM_DEBUG({
|
|
dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n'
|
|
<< "Edge counts:";
|
|
for (auto [State, Count] : enumerate(EdgeStateCounts))
|
|
dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count;
|
|
dbgs() << "\n\n";
|
|
});
|
|
|
|
State.BundleStates[I] = BundleState;
|
|
}
|
|
}
|
|
|
|
void MachineSMEABI::insertStateChanges(bool IsAgnosticZA) {
|
|
for (MachineBasicBlock &MBB : *MF) {
|
|
const BlockInfo &Block = State.Blocks[MBB.getNumber()];
|
|
ZAState InState = State.BundleStates[Bundles->getBundle(MBB.getNumber(),
|
|
/*Out=*/false)];
|
|
|
|
ZAState CurrentState = Block.FixedEntryState;
|
|
if (CurrentState == ZAState::ANY)
|
|
CurrentState = InState;
|
|
|
|
for (auto &Inst : Block.Insts) {
|
|
if (CurrentState != Inst.NeededState)
|
|
emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState,
|
|
Inst.PhysLiveRegs, IsAgnosticZA);
|
|
CurrentState = Inst.NeededState;
|
|
}
|
|
|
|
if (MBB.succ_empty())
|
|
continue;
|
|
|
|
ZAState OutState =
|
|
State.BundleStates[Bundles->getBundle(MBB.getNumber(), /*Out=*/true)];
|
|
if (CurrentState != OutState)
|
|
emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState,
|
|
Block.PhysLiveRegsAtExit, IsAgnosticZA);
|
|
}
|
|
}
|
|
|
|
/// Returns the function's TPIDR2 block. The first call creates a 16-byte,
/// 16-byte-aligned stack object for it; later calls return the cached result.
TPIDR2State MachineSMEABI::getTPIDR2Block() {
  if (!State.TPIDR2Block) {
    MachineFrameInfo &FrameInfo = MF->getFrameInfo();
    State.TPIDR2Block =
        TPIDR2State{FrameInfo.CreateStackObject(16, Align(16), false)};
  }
  return *State.TPIDR2Block;
}
|
|
|
|
/// Returns the debug location of the instruction at \p MBBI, or an empty
/// DebugLoc when \p MBBI is the end of \p MBB.
static DebugLoc getDebugLoc(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI) {
  if (MBBI == MBB.end())
    return DebugLoc();
  return MBBI->getDebugLoc();
}
|
|
|
|
/// Emits, before \p MBBI, the code that sets up a lazy save: TPIDR2_EL0 is
/// written with the address of the function's TPIDR2 block.
void MachineSMEABI::emitSetupLazySave(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MBBI) {
  DebugLoc DL = getDebugLoc(MBB, MBBI);

  // Materialize the address of the TPIDR2 block from its frame index.
  Register BlockAddr = MRI->createVirtualRegister(&AArch64::GPR64spRegClass);
  Register BlockAddrCopy = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), BlockAddr)
      .addFrameIndex(getTPIDR2Block().FrameIndex)
      .addImm(0)
      .addImm(0);
  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BlockAddrCopy)
      .addReg(BlockAddr);

  // Point TPIDR2_EL0 at the TPIDR2 block.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
      .addImm(AArch64SysReg::TPIDR2_EL0)
      .addReg(BlockAddrCopy);
}
|
|
|
|
/// Emits, before \p MBBI, copies of the live physical registers named in
/// \p PhysLiveRegs into fresh virtual registers.
///
/// \returns a record of what was saved, for use with restorePhyRegSave.
PhysRegSave MachineSMEABI::createPhysRegSave(LiveRegs PhysLiveRegs,
                                             MachineBasicBlock &MBB,
                                             MachineBasicBlock::iterator MBBI,
                                             DebugLoc DL) {
  PhysRegSave Saved{PhysLiveRegs};

  // Read the flags into a GPR when NZCV is live.
  if (PhysLiveRegs & LiveRegs::NZCV) {
    Saved.StatusFlags = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), Saved.StatusFlags)
        .addImm(AArch64SysReg::NZCV)
        .addReg(AArch64::NZCV, RegState::Implicit);
  }

  // Note: Preserving X0 is "free" as this is before register allocation, so
  // the register allocator is still able to optimize these copies.
  if (PhysLiveRegs & LiveRegs::W0) {
    // Save all of X0 only if its upper half is live too; otherwise just W0.
    const bool UpperHalfLive =
        (PhysLiveRegs & LiveRegs::W0_HI) != LiveRegs::None;
    const TargetRegisterClass *SaveRC =
        UpperHalfLive ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
    const Register LivePhysReg = UpperHalfLive ? AArch64::X0 : AArch64::W0;

    Saved.X0Save = MRI->createVirtualRegister(SaveRC);
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), Saved.X0Save)
        .addReg(LivePhysReg);
  }

  return Saved;
}
|
|
|
|
/// Emits, before \p MBBI, the code that writes the values recorded in
/// \p RegSave back to their physical registers. Registers that were not saved
/// (fields equal to AArch64::NoRegister) are left alone.
void MachineSMEABI::restorePhyRegSave(PhysRegSave const &RegSave,
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MBBI,
                                      DebugLoc DL) {
  const bool FlagsWereSaved = RegSave.StatusFlags != AArch64::NoRegister;
  if (FlagsWereSaved) {
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
        .addImm(AArch64SysReg::NZCV)
        .addReg(RegSave.StatusFlags)
        .addReg(AArch64::NZCV, RegState::ImplicitDefine);
  }

  const bool X0WasSaved = RegSave.X0Save != AArch64::NoRegister;
  if (X0WasSaved) {
    // Restore X0 if its upper half was live at the save, otherwise just W0.
    const bool UpperHalfLive =
        (RegSave.PhysLiveRegs & LiveRegs::W0_HI) != LiveRegs::None;
    const Register DestPhysReg = UpperHalfLive ? AArch64::X0 : AArch64::W0;
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), DestPhysReg)
        .addReg(RegSave.X0Save);
  }
}
|
|
|
|
/// Emits, before \p MBBI, the code that restores ZA after a lazy save: ZA is
/// enabled, ZA is (conditionally) restored via RestoreZAPseudo, and TPIDR2_EL0
/// is zeroed. The live registers in \p PhysLiveRegs are preserved around the
/// sequence.
void MachineSMEABI::emitRestoreLazySave(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
                                        LiveRegs PhysLiveRegs) {
  auto *TLI = Subtarget->getTargetLowering();
  DebugLoc DL = getDebugLoc(MBB, MBBI);
  Register CurrentTPIDR2 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  // The TPIDR2 block address is passed in X0.
  Register BlockAddr = AArch64::X0;

  // TODO: Emit these within the restore MBB to prevent unnecessary saves.
  PhysRegSave Saved = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);

  // Turn ZA on.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
      .addImm(AArch64SVCR::SVCRZA)
      .addImm(1);

  // Read the current value of TPIDR2_EL0.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), CurrentTPIDR2)
      .addImm(AArch64SysReg::TPIDR2_EL0);

  // Materialize the address of the TPIDR2 block.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), BlockAddr)
      .addFrameIndex(getTPIDR2Block().FrameIndex)
      .addImm(0)
      .addImm(0);

  // (Conditionally) restore the ZA state.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::RestoreZAPseudo))
      .addReg(CurrentTPIDR2)
      .addReg(BlockAddr)
      .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_RESTORE))
      .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());

  // Clear TPIDR2_EL0.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
      .addImm(AArch64SysReg::TPIDR2_EL0)
      .addReg(AArch64::XZR);

  restorePhyRegSave(Saved, MBB, MBBI, DL);
}
|
|
|
|
/// Emits, before \p MBBI, the code that turns ZA off.
///
/// \param ClearTPIDR2 when true, TPIDR2_EL0 is zeroed before ZA is disabled.
void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
                              bool ClearTPIDR2) {
  DebugLoc DL = getDebugLoc(MBB, MBBI);

  if (ClearTPIDR2) {
    // Zero TPIDR2_EL0.
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
        .addImm(AArch64SysReg::TPIDR2_EL0)
        .addReg(AArch64::XZR);
  }

  // Switch ZA off.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
      .addImm(AArch64SVCR::SVCRZA)
      .addImm(0);
}
|
|
|
|
/// Emits, before \p MBBI, the code that allocates the lazy save buffer
/// (unless SelectionDAG already allocated it) and initializes the TPIDR2
/// block with the buffer pointer and the SVL value.
void MachineSMEABI::emitAllocateLazySaveBuffer(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  auto *FuncInfo = MF->getInfo<AArch64FunctionInfo>();

  DebugLoc DL = getDebugLoc(MBB, MBBI);
  Register OrigSP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  Register SaveBuffer = FuncInfo->getEarlyAllocSMESaveBuffer();

  // Compute SVL.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);

  // Step 1: allocate the lazy save buffer, unless it already exists.
  const bool NeedsAllocation = SaveBuffer == AArch64::NoRegister;
  if (NeedsAllocation) {
    // TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
    // the buffer register is already set here). This is done to reuse the
    // existing expansions (which can insert stack checks). This works, but it
    // means we will always allocate the lazy save buffer (even if the function
    // contains no lazy saves). If we want to handle Windows here, we'll need
    // to implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
    assert(!Subtarget->isTargetWindows() &&
           "Lazy ZA save is not yet supported on Windows");
    SaveBuffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);

    // Read the original stack pointer.
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), OrigSP)
        .addReg(AArch64::SP);

    // Carve the buffer out of the stack: SP - SVL * SVL becomes both the
    // buffer address and the new stack pointer.
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSUBXrrr), SaveBuffer)
        .addReg(SVL)
        .addReg(SVL)
        .addReg(OrigSP);
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::SP)
        .addReg(SaveBuffer);

    // We have just allocated a variable sized object, tell this to PEI.
    FrameInfo.CreateVariableSizedObject(Align(16), nullptr);
  }

  // Step 2: set up the TPIDR2 block.

  // Note: The big-endian case just needs to do `SVL << 48`. It is not
  // implemented as we generally don't support big-endian SVE/SME.
  if (!Subtarget->isLittleEndian())
    reportFatalInternalError(
        "TPIDR2 block initialization is not supported on big-endian targets");

  // Store the buffer pointer and num_za_save_slices.
  // Bytes 10-15 are implicitly zeroed.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::STPXi))
      .addReg(SaveBuffer)
      .addReg(SVL)
      .addFrameIndex(getTPIDR2Block().FrameIndex)
      .addImm(0);
}
|
|
|
|
/// Emits, before \p MBBI, the prologue for a function with new ZA state:
/// TPIDR2_EL0 is read and passed to CommitZASavePseudo (which commits any
/// pending lazy save), then ZA is enabled.
void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MBBI) {
  auto *TLI = Subtarget->getTargetLowering();
  DebugLoc DL = getDebugLoc(MBB, MBBI);

  // Read the current value of TPIDR2_EL0.
  Register CurrentTPIDR2 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS))
      .addReg(CurrentTPIDR2, RegState::Define)
      .addImm(AArch64SysReg::TPIDR2_EL0);

  // If TPIDR2_EL0 is non-zero, commit the lazy save.
  // NOTE: Functions that only use ZT0 don't need to zero ZA.
  const bool ShouldZeroZA =
      MF->getInfo<AArch64FunctionInfo>()->getSMEFnAttrs().hasZAState();
  auto CommitMI =
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo))
          .addReg(CurrentTPIDR2)
          .addImm(ShouldZeroZA ? 1 : 0)
          .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE))
          .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
  // Zeroing ZA defines it, so record that on the pseudo.
  if (ShouldZeroZA)
    CommitMI.addDef(AArch64::ZAB0, RegState::ImplicitDefine);

  // Enable ZA (as ZA could have previously been in the OFF state).
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
      .addImm(AArch64SVCR::SVCRZA)
      .addImm(1);
}
|
|
|
|
/// Returns the register that holds the pointer to the agnostic ZA save
/// buffer. On first use this is the early-allocated buffer register if the
/// function has one, otherwise a new virtual register; the choice is cached.
Register MachineSMEABI::getAgnosticZABufferPtr() {
  Register &Cached = State.AgnosticZABufferPtr;
  if (Cached == AArch64::NoRegister) {
    auto EarlyBuffer =
        MF->getInfo<AArch64FunctionInfo>()->getEarlyAllocSMESaveBuffer();
    if (EarlyBuffer == AArch64::NoRegister)
      Cached = MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
    else
      Cached = EarlyBuffer;
  }
  return Cached;
}
|
|
|
|
/// Emits, before \p MBBI, a call to the full ZA save routine (\p IsSave true)
/// or restore routine (\p IsSave false), passing the agnostic ZA buffer
/// pointer in X0. The live registers in \p PhysLiveRegs are preserved around
/// the call. Also notes in the pass state that a full save/restore exists.
void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator MBBI,
                                          LiveRegs PhysLiveRegs, bool IsSave) {
  auto *TLI = Subtarget->getTargetLowering();
  State.HasFullZASaveRestore = true;
  DebugLoc DL = getDebugLoc(MBB, MBBI);
  Register ArgReg = AArch64::X0;

  PhysRegSave Saved = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);

  // Move the buffer pointer into X0.
  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), ArgReg)
      .addReg(getAgnosticZABufferPtr());

  // Call __arm_sme_save/__arm_sme_restore.
  const auto Routine =
      IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
      .addReg(ArgReg, RegState::Implicit)
      .addExternalSymbol(TLI->getLibcallName(Routine))
      .addRegMask(TRI->getCallPreservedMask(
          *MF,
          CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));

  restorePhyRegSave(Saved, MBB, MBBI, DL);
}
|
|
|
|
/// Emits, before \p MBBI, the code that allocates the save buffer used by
/// agnostic ZA functions: the required size is obtained by calling the SME
/// state size routine, and that many bytes are then carved out of the stack.
/// Nothing is emitted if the buffer was already allocated in SelectionDAG.
/// The live registers in \p PhysLiveRegs are preserved around the sequence.
void MachineSMEABI::emitAllocateFullZASaveBuffer(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    LiveRegs PhysLiveRegs) {
  auto *AFI = MF->getInfo<AArch64FunctionInfo>();

  // Buffer already allocated in SelectionDAG.
  if (AFI->getEarlyAllocSMESaveBuffer())
    return;

  DebugLoc DL = getDebugLoc(MBB, MBBI);
  Register BufferPtr = getAgnosticZABufferPtr();
  Register BufferSize = MRI->createVirtualRegister(&AArch64::GPR64RegClass);

  PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);

  // Calculate the SME state size. The call defines X0, which is then copied
  // into a virtual register.
  {
    auto *TLI = Subtarget->getTargetLowering();
    // Note: This uses the pass's cached register info (the `TRI` member)
    // rather than re-querying the subtarget into a local that shadows it.
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
        .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_SME_STATE_SIZE))
        .addReg(AArch64::X0, RegState::ImplicitDefine)
        .addRegMask(TRI->getCallPreservedMask(
            *MF, CallingConv::
                     AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferSize)
        .addReg(AArch64::X0);
  }

  // Allocate a buffer object of the size given by __arm_sme_state_size.
  {
    MachineFrameInfo &MFI = MF->getFrameInfo();
    // SP = SP - BufferSize; the new SP is the address of the buffer.
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
        .addReg(AArch64::SP)
        .addReg(BufferSize)
        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
        .addReg(AArch64::SP);

    // We have just allocated a variable sized object, tell this to PEI.
    MFI.CreateVariableSizedObject(Align(16), nullptr);
  }

  restorePhyRegSave(RegSave, MBB, MBBI, DL);
}
|
|
|
|
/// Emits the code needed to move ZA from state \p From to state \p To at
/// \p InsertPt. Transitions involving ZAState::ANY emit nothing. Any
/// transition not handled below is treated as unreachable.
void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator InsertPt,
                                    ZAState From, ZAState To,
                                    LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
  // ZA not used.
  if (From == ZAState::ANY || To == ZAState::ANY)
    return;

  // TODO: Avoid setting up the save buffer if there's no transition to
  // LOCAL_SAVED.
  if (From == ZAState::CALLER_DORMANT) {
    // Leaving the CALLER_DORMANT state straight to OFF means this new ZA
    // function did not touch ZA (so ZA was never turned on).
    if (To == ZAState::OFF)
      return;

    assert(MBB.getParent()
               ->getInfo<AArch64FunctionInfo>()
               ->getSMEFnAttrs()
               .hasPrivateZAInterface() &&
           "CALLER_DORMANT state requires private ZA interface");
    assert(&MBB == &MBB.getParent()->front() &&
           "CALLER_DORMANT state only valid in entry block");
    emitNewZAPrologue(MBB, MBB.getFirstNonPHI());

    // ZA is active after the prologue, so there is nothing more to do.
    if (To == ZAState::ACTIVE)
      return;

    // Note: "emitNewZAPrologue" zeros ZA, so we may need to setup a lazy save
    // if "To" is "ZAState::LOCAL_SAVED". It may be possible to improve this
    // case by changing the placement of the zero instruction.
    From = ZAState::ACTIVE;
  }

  if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED) {
    emitZASave(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
    return;
  }

  if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE) {
    emitZARestore(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
    return;
  }

  if (To == ZAState::OFF) {
    assert(From != ZAState::CALLER_DORMANT &&
           "CALLER_DORMANT to OFF should have already been handled");
    assert(!IsAgnosticZA && "Should not turn ZA off in agnostic ZA function");
    // TPIDR2_EL0 only needs clearing when coming from LOCAL_SAVED.
    emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
    return;
  }

  dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
         << getZAStateString(To) << '\n';
  llvm_unreachable("Unimplemented state transition");
}
|
|
|
|
} // end anonymous namespace
|
|
|
|
// Pass registration for MachineSMEABI under the "aarch64-machine-sme-abi"
// argument.
INITIALIZE_PASS(MachineSMEABI, "aarch64-machine-sme-abi", "Machine SME ABI",
                false, false)
|
|
|
|
/// Pass entry point. Does nothing (and returns false) for targets without SME
/// and for functions that have no ZA state, no ZT0 state and no agnostic ZA
/// interface. Otherwise collects the needed ZA states, assigns bundle states,
/// inserts the state changes, allocates the save buffer if one is needed, and
/// returns true.
bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
  if (!MF.getSubtarget<AArch64Subtarget>().hasSME())
    return false;

  auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  SMEAttrs SMEFnAttrs = FuncInfo->getSMEFnAttrs();
  const bool NeedsZAHandling = SMEFnAttrs.hasZAState() ||
                               SMEFnAttrs.hasZT0State() ||
                               SMEFnAttrs.hasAgnosticZAInterface();
  if (!NeedsZAHandling)
    return false;

  assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");

  // Start from a clean slate and cache the per-function context.
  State = PassState{};
  this->MF = &MF;
  Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
  Subtarget = &MF.getSubtarget<AArch64Subtarget>();
  TII = Subtarget->getInstrInfo();
  TRI = Subtarget->getRegisterInfo();
  MRI = &MF.getRegInfo();

  const bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();

  collectNeededZAStates(SMEFnAttrs);
  assignBundleZAStates();
  insertStateChanges(/*IsAgnosticZA=*/IsAgnosticZA);

  // A save buffer is only needed if a full save/restore was emitted or a
  // TPIDR2 block was created.
  if (!State.HasFullZASaveRestore && !State.TPIDR2Block)
    return true;

  if (State.AfterSMEProloguePt) {
    // Note: With inline stack probes the AfterSMEProloguePt may not be in the
    // entry block (due to the probing loop).
    MachineBasicBlock::iterator AllocPt = *State.AfterSMEProloguePt;
    emitAllocateZASaveBuffer(*AllocPt->getParent(), AllocPt,
                             State.PhysLiveRegsAfterSMEPrologue,
                             /*IsAgnosticZA=*/IsAgnosticZA);
    return true;
  }

  // No marker: allocate at the start of the entry block.
  MachineBasicBlock &Entry = MF.front();
  emitAllocateZASaveBuffer(Entry, Entry.getFirstNonPHI(),
                           State.Blocks[Entry.getNumber()].PhysLiveRegsAtEntry,
                           /*IsAgnosticZA=*/IsAgnosticZA);
  return true;
}
|
|
|
|
/// Creates a new instance of the Machine SME ABI pass. The caller receives a
/// heap-allocated pass object.
FunctionPass *llvm::createMachineSMEABIPass() {
  return new MachineSMEABI();
}
|