Compare commits

...

5 Commits

Author SHA1 Message Date
Benjamin Maxwell
2d5441cc1f [AArch64][SME] Avoid ZA save state changes in loops in MachineSMEABIPass
This patch uses the MachineLoopInfo to give blocks within loops a higher
weight when choosing the bundle ZA state. MachineLoopInfo does not find
loop trip counts, so this uses an arbitrary weight (default 10), which
can be configured with the `-aarch64-sme-abi-loop-edge-weight` flag.

This makes the MachineSMEABIPass pass more likely to pick a bundle state
that matches the loop's entry/exit state, which avoids state changes in
the loop (which we assume will happen more than once).

This does require some extra analysis, so this is only enabled at -O1
and above.

Change-Id: If318c809d2f7cc1fca144fbe424ba2a2ca7fb19f
2025-08-20 13:28:16 +00:00
Benjamin Maxwell
ea77b25e78 Fixup checks
Change-Id: I94018ed55c302de670f7a0b25fd28605d9bed2b6
2025-08-20 13:24:43 +00:00
Benjamin Maxwell
d8a8c4e3e0 [AArch64][SME] Support agnostic ZA functions in the MachineSMEABIPass
This extends the MachineSMEABIPass to handle agnostic ZA functions. This
case is currently handled like shared ZA functions, but we don't require
ZA state to be reloaded before agnostic ZA calls.

Note: This patch does not yet fully handle agnostic ZA functions that
can catch exceptions. E.g.:

```
__arm_agnostic("sme_za_state") void try_catch_agnostic_za_callee()
{
  try {
    agnostic_za_call();
  } catch(...) {
    noexcept_agnostic_za_call();
  }
}
```

In this case, we won't commit a ZA save before the `agnostic_za_call()`,
even though one would be needed to restore ZA in the catch block. This
will be handled in a later patch.

Change-Id: I9cce7b42ec8b64d5442b35231b65dfaf9d149eed
2025-08-20 13:15:45 +00:00
Benjamin Maxwell
142af7d225 Update comment
Change-Id: I5dca6eaca8613a33e89a5cec9cc7d2c0f9cc7fb5
2025-08-20 13:13:00 +00:00
Benjamin Maxwell
9799316c58 [AArch64][SME] Support Windows/stack probes in MachineSMEABIPass
On Windows or with stack probes on other targets, additional code needs
to be inserted after dynamic stack allocations to validate stack
accesses and/or ensure enough stack space has been allocated.

Rather than handle this case in the MachineSMEABIPass (like we do for
the standard case), we allocate the memory for the lazy save buffer in
SelectionDAG, which allows the existing expansions to emit the correct
code.

Note: This means in these cases, we may allocate a lazy save buffer when
there are no lazy saves present in the function (as we have to allocate
the buffer before the MachineSMEABIPass runs).

Change-Id: If89ab54c4de79f6fe5513a6b387e9e349f7bc7d1
2025-08-20 13:12:59 +00:00
12 changed files with 636 additions and 79 deletions

View File

@ -60,7 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
FunctionPass *createSMEABIPass();
FunctionPass *createSMEPeepholeOptPass();
FunctionPass *createMachineSMEABIPass();
FunctionPass *createMachineSMEABIPass(CodeGenOptLevel);
ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,

View File

@ -1688,6 +1688,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
}
case AArch64::InOutZAUsePseudo:
case AArch64::RequiresZASavePseudo:
case AArch64::SMEStateAllocPseudo:
case AArch64::COALESCER_BARRIER_FPR16:
case AArch64::COALESCER_BARRIER_FPR32:
case AArch64::COALESCER_BARRIER_FPR64:

View File

@ -8292,7 +8292,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
if (getTM().useNewSMEABILowering()) {
if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
SDValue Size;
if (Attrs.hasZAState()) {
SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
DAG.getConstant(1, DL, MVT::i32));
Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
} else if (Attrs.hasAgnosticZAInterface()) {
RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
SDValue Callee = DAG.getExternalSymbol(
getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
getLibcallCallingConv(LC), RetTy, Callee, {});
std::tie(Size, Chain) = LowerCallTo(CLI);
}
if (Size) {
SDValue Buffer = DAG.getNode(
ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
{Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
Chain = Buffer.getValue(1);
Register BufferPtr =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
DAG.getVTList(MVT::Other), Chain);
FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
MFI.CreateVariableSizedObject(Align(16), nullptr);
}
}
} else {
// Old SME ABI lowering (deprecated):
// Create a 16 Byte TPIDR2 object. The dynamic buffer
// will be expanded and stored in the static object later using a
@ -9081,9 +9113,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
// TODO: Handle agnostic ZA functions.
if (!UseNewSMEABILowering || IsAgnosticZAFunction)
if (!UseNewSMEABILowering)
return std::nullopt;
if (IsAgnosticZAFunction) {
if (CallAttrs.requiresPreservingAllZAState())
return AArch64ISD::REQUIRES_ZA_SAVE;
return std::nullopt;
}
if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
return std::nullopt;
return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
@ -9163,7 +9199,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
};
bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
bool RequiresSaveAllZA =
!UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
if (RequiresLazySave) {
const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
MachinePointerInfo MPI =

View File

@ -238,6 +238,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// Holds the SME function attributes (streaming mode, ZA/ZT0 state).
SMEAttrs SMEFnAttrs;
// Holds the virtual register containing the SME save buffer pointer if the
// buffer was allocated early in SelectionDAG (for Windows/stack probes
// support).
Register EarlyAllocSMESaveBuffer = AArch64::NoRegister;
// Note: The following properties are only used for the old SME ABI lowering:
/// The frame-index for the TPIDR2 object used for lazy saves.
TPIDR2Object TPIDR2;
@ -256,6 +260,12 @@ public:
const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
const override;
/// Records the register that holds the address of the SME save buffer when
/// the buffer has been allocated early (during SelectionDAG lowering).
void setEarlyAllocSMESaveBuffer(Register Ptr) {
EarlyAllocSMESaveBuffer = Ptr;
}
/// Returns the register holding the early-allocated SME save buffer pointer,
/// or AArch64::NoRegister if no buffer was allocated early. Const-qualified
/// so it can be queried through a const AArch64FunctionInfo, matching the
/// other getters in this class.
Register getEarlyAllocSMESaveBuffer() const {
  return EarlyAllocSMESaveBuffer;
}
// Old SME ABI lowering state getters/setters:
Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; };

View File

@ -93,6 +93,8 @@ let hasSideEffects = 1, isMeta = 1 in {
def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
}
def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
def CommitZASavePseudo
: Pseudo<(outs),
(ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>,
@ -108,6 +110,11 @@ def AArch64_requires_za_save
[SDNPHasChain, SDNPInGlue]>;
def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
def AArch64_sme_state_alloc
: SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
[SDNPHasChain]>;
def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>;
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//

View File

@ -791,8 +791,8 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
}
void AArch64PassConfig::addMachineSSAOptimization() {
if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None)
addPass(createMachineSMEABIPass());
if (TM->getOptLevel() != CodeGenOptLevel::None && EnableNewSMEABILowering)
addPass(createMachineSMEABIPass(TM->getOptLevel()));
if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt)
addPass(createSMEPeepholeOptPass());
@ -825,7 +825,7 @@ bool AArch64PassConfig::addILPOpts() {
void AArch64PassConfig::addPreRegAlloc() {
if (TM->getOptLevel() == CodeGenOptLevel::None && EnableNewSMEABILowering)
addPass(createMachineSMEABIPass());
addPass(createMachineSMEABIPass(CodeGenOptLevel::None));
// Change dead register definitions to refer to the zero register.
if (TM->getOptLevel() != CodeGenOptLevel::None &&

View File

@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This pass implements the SME ABI requirements for ZA state. This includes
// implementing the lazy ZA state save schemes around calls.
// implementing the lazy (and agnostic) ZA state save schemes around calls.
//
//===----------------------------------------------------------------------===//
//
@ -63,6 +63,7 @@
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@ -70,6 +71,12 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64-machine-sme-abi"
// Weight applied to the desired ZA state of blocks that are inside a loop
// when voting for an edge bundle's state (blocks outside loops count as 1).
// MachineLoopInfo does not provide trip counts, so this is a tunable
// heuristic rather than a measured frequency.
static cl::opt<int>
    LoopEdgeWeight("aarch64-sme-abi-loop-edge-weight", cl::ReallyHidden,
                   cl::init(10),
                   cl::desc("Edge weight for basic blocks within loops (used "
                            "for placing ZA saves/restores)"));
namespace {
enum ZAState {
@ -176,7 +183,8 @@ getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
struct MachineSMEABI : public MachineFunctionPass {
inline static char ID = 0;
MachineSMEABI() : MachineFunctionPass(ID) {}
MachineSMEABI(CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
: MachineFunctionPass(ID), OptLevel(OptLevel) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@ -185,6 +193,9 @@ struct MachineSMEABI : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<EdgeBundlesWrapperLegacy>();
// Only analyse loops at -O1 and above.
if (OptLevel != CodeGenOptLevel::None)
AU.addRequired<MachineLoopInfoWrapperPass>();
AU.addPreservedID(MachineLoopInfoID);
AU.addPreservedID(MachineDominatorsID);
MachineFunctionPass::getAnalysisUsage(AU);
@ -200,7 +211,7 @@ struct MachineSMEABI : public MachineFunctionPass {
/// Inserts code to handle changes between ZA states within the function.
/// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA.
void insertStateChanges();
void insertStateChanges(bool IsAgnosticZA);
// Emission routines for private and shared ZA functions (using lazy saves).
void emitNewZAPrologue(MachineBasicBlock &MBB,
@ -215,8 +226,41 @@ struct MachineSMEABI : public MachineFunctionPass {
void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
bool ClearTPIDR2);
// Emission routines for agnostic ZA functions.
void emitSetupFullZASave(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
LiveRegs PhysLiveRegs);
void emitFullZASaveRestore(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
LiveRegs PhysLiveRegs, bool IsSave);
void emitAllocateFullZASaveBuffer(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
LiveRegs PhysLiveRegs);
void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
ZAState From, ZAState To, LiveRegs PhysLiveRegs);
ZAState From, ZAState To, LiveRegs PhysLiveRegs,
bool IsAgnosticZA);
// Helpers for switching between lazy/full ZA save/restore routines.
/// Emits a ZA save at \p MBBI: a full save for agnostic-ZA functions,
/// otherwise the setup of a lazy save.
void emitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
  if (!IsAgnosticZA) {
    // Lazy saves do not need the live physical register information.
    emitSetupLazySave(MBB, MBBI);
    return;
  }
  emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true);
}
/// Emits a ZA restore at \p MBBI: a full restore for agnostic-ZA functions,
/// otherwise the restore of a lazy save.
void emitZARestore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                   LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
  if (!IsAgnosticZA) {
    emitRestoreLazySave(MBB, MBBI, PhysLiveRegs);
    return;
  }
  emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false);
}
/// Emits the save buffer allocation at \p MBBI, selecting the full-ZA
/// buffer for agnostic-ZA functions and the lazy save buffer otherwise.
void emitAllocateZASaveBuffer(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
                              LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
  if (!IsAgnosticZA) {
    // The lazy save buffer allocation does not take live register info.
    emitAllocateLazySaveBuffer(MBB, MBBI);
    return;
  }
  emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs);
}
/// Save live physical registers to virtual registers.
PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB,
@ -228,6 +272,8 @@ struct MachineSMEABI : public MachineFunctionPass {
/// Get or create a TPIDR2 block in this function.
TPIDR2State getTPIDR2Block();
Register getAgnosticZABufferPtr();
private:
/// Contains the needed ZA state (and live registers) at an instruction.
struct InstInfo {
@ -241,14 +287,21 @@ private:
struct BlockInfo {
ZAState FixedEntryState{ZAState::ANY};
SmallVector<InstInfo> Insts;
LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
};
CodeGenOptLevel OptLevel = CodeGenOptLevel::Default;
// All pass state that must be cleared between functions.
struct PassState {
SmallVector<BlockInfo> Blocks;
SmallVector<ZAState> BundleStates;
std::optional<TPIDR2State> TPIDR2Block;
std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
Register AgnosticZABufferPtr = AArch64::NoRegister;
LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
bool HasFullZASaveRestore = false;
} State;
MachineFunction *MF = nullptr;
@ -257,10 +310,12 @@ private:
const AArch64RegisterInfo *TRI = nullptr;
const TargetInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
MachineLoopInfo *MLI = nullptr;
};
void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) &&
assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
SMEFnAttrs.hasZAState()) &&
"Expected function to have ZA/ZT0 state!");
State.Blocks.resize(MF->getNumBlockIDs());
@ -294,10 +349,20 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
for (MachineInstr &MI : reverse(MBB)) {
MachineBasicBlock::iterator MBBI(MI);
LiveUnits.stepBackward(MI);
LiveRegs PhysLiveRegs = GetPhysLiveRegs();
// The SMEStateAllocPseudo marker is added to a function if the save
// buffer was allocated in SelectionDAG. It marks the end of the
// allocation -- which is a safe point for this pass to insert any TPIDR2
// block setup.
if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
State.AfterSMEProloguePt = MBBI;
State.PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
}
// Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
auto [NeededState, InsertPt] = getZAStateBeforeInst(
*TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
assert((InsertPt == MBBI ||
@ -306,6 +371,8 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
// TODO: Do something to avoid state changes where NZCV is live.
if (MBBI == FirstTerminatorInsertPt)
Block.PhysLiveRegsAtExit = PhysLiveRegs;
if (MBBI == FirstNonPhiInsertPt)
Block.PhysLiveRegsAtEntry = PhysLiveRegs;
if (NeededState != ZAState::ANY)
Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
}
@ -334,18 +401,23 @@ void MachineSMEABI::assignBundleZAStates() {
LLVM_DEBUG(dbgs() << " (no state preference)\n");
continue;
}
bool IsLoop = MLI && MLI->getLoopFor(MF->getBlockNumbered(BlockID));
bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I;
bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I;
int EdgeWeight = IsLoop ? LoopEdgeWeight : 1;
if (IsLoop)
LLVM_DEBUG(dbgs() << " IsLoop");
LLVM_DEBUG(dbgs() << " (EdgeWeight: " << EdgeWeight << ')');
ZAState DesiredIncomingState = Block.Insts.front().NeededState;
if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
EdgeStateCounts[DesiredIncomingState]++;
EdgeStateCounts[DesiredIncomingState] += EdgeWeight;
LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
<< getZAStateString(DesiredIncomingState));
}
ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
EdgeStateCounts[DesiredOutgoingState]++;
EdgeStateCounts[DesiredOutgoingState] += EdgeWeight;
LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
<< getZAStateString(DesiredOutgoingState));
}
@ -372,7 +444,7 @@ void MachineSMEABI::assignBundleZAStates() {
}
}
void MachineSMEABI::insertStateChanges() {
void MachineSMEABI::insertStateChanges(bool IsAgnosticZA) {
for (MachineBasicBlock &MBB : *MF) {
const BlockInfo &Block = State.Blocks[MBB.getNumber()];
ZAState InState = State.BundleStates[Bundles->getBundle(MBB.getNumber(),
@ -385,7 +457,7 @@ void MachineSMEABI::insertStateChanges() {
for (auto &Inst : Block.Insts) {
if (CurrentState != Inst.NeededState)
emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState,
Inst.PhysLiveRegs);
Inst.PhysLiveRegs, IsAgnosticZA);
CurrentState = Inst.NeededState;
}
@ -396,7 +468,7 @@ void MachineSMEABI::insertStateChanges() {
State.BundleStates[Bundles->getBundle(MBB.getNumber(), /*Out=*/true)];
if (CurrentState != OutState)
emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState,
Block.PhysLiveRegsAtExit);
Block.PhysLiveRegsAtExit, IsAgnosticZA);
}
}
@ -529,23 +601,27 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
void MachineSMEABI::emitAllocateLazySaveBuffer(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
MachineFrameInfo &MFI = MF->getFrameInfo();
auto *AFI = MF->getInfo<AArch64FunctionInfo>();
DebugLoc DL = getDebugLoc(MBB, MBBI);
Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
// Calculate SVL.
BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);
// 1. Allocate the lazy save buffer.
{
// TODO This function grows the stack with a subtraction, which doesn't work
// on Windows. Some refactoring to share the functionality in
// LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
// supports SME
if (Buffer == AArch64::NoRegister) {
// TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
// Buffer != AArch64::NoRegister). This is done to reuse the existing
// expansions (which can insert stack checks). This works, but it means we
// will always allocate the lazy save buffer (even if the function contains
// no lazy saves). If we want to handle Windows here, we'll need to
// implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
assert(!Subtarget->isTargetWindows() &&
"Lazy ZA save is not yet supported on Windows");
Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
// Get original stack pointer.
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
.addReg(AArch64::SP);
@ -606,10 +682,95 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
.addImm(1);
}
/// Returns the register used as the agnostic-ZA save buffer pointer for the
/// current function. The register is chosen once and cached in the pass
/// state: the early-allocated buffer register is reused if one exists,
/// otherwise a fresh GPR64 virtual register is created.
Register MachineSMEABI::getAgnosticZABufferPtr() {
  Register &Cached = State.AgnosticZABufferPtr;
  if (Cached != AArch64::NoRegister)
    return Cached;

  Register EarlyBuffer =
      MF->getInfo<AArch64FunctionInfo>()->getEarlyAllocSMESaveBuffer();
  Cached = EarlyBuffer != AArch64::NoRegister
               ? EarlyBuffer
               : MF->getRegInfo().createVirtualRegister(
                     &AArch64::GPR64RegClass);
  return Cached;
}
/// Emits a call to the SME ABI full save routine (\p IsSave) or full restore
/// routine (!\p IsSave) before \p MBBI, passing the agnostic ZA buffer
/// pointer in X0. Live physical registers described by \p PhysLiveRegs are
/// saved before and restored after the call sequence.
void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator MBBI,
                                          LiveRegs PhysLiveRegs, bool IsSave) {
  // Flag that a save buffer must be allocated for this function.
  State.HasFullZASaveRestore = true;

  // Resolve the routine to call and its call-preserved register mask.
  const RTLIB::Libcall Routine =
      IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
  const char *RoutineName =
      Subtarget->getTargetLowering()->getLibcallName(Routine);
  const auto *PreservedMask = TRI->getCallPreservedMask(
      *MF, CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);

  const Register ArgReg = AArch64::X0;
  DebugLoc DL = getDebugLoc(MBB, MBBI);
  PhysRegSave SavedRegs = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);

  // Copy the buffer pointer into X0.
  BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), ArgReg)
      .addReg(getAgnosticZABufferPtr());

  // Call __arm_sme_save/__arm_sme_restore.
  BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
      .addReg(ArgReg, RegState::Implicit)
      .addExternalSymbol(RoutineName)
      .addRegMask(PreservedMask);

  restorePhyRegSave(SavedRegs, MBB, MBBI, DL);
}
/// Allocates the save buffer used by full (agnostic) ZA saves before
/// \p MBBI, unless the buffer was already allocated in SelectionDAG.
///
/// The emitted sequence calls __arm_sme_state_size, subtracts the returned
/// size from SP, and copies the new SP into the agnostic ZA buffer pointer
/// register. Live physical registers described by \p PhysLiveRegs are saved
/// before and restored after the sequence.
void MachineSMEABI::emitAllocateFullZASaveBuffer(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    LiveRegs PhysLiveRegs) {
  auto *AFI = MF->getInfo<AArch64FunctionInfo>();

  // Buffer already allocated in SelectionDAG.
  if (AFI->getEarlyAllocSMESaveBuffer() != AArch64::NoRegister)
    return;

  DebugLoc DL = getDebugLoc(MBB, MBBI);
  Register BufferPtr = getAgnosticZABufferPtr();
  Register BufferSize = MRI->createVirtualRegister(&AArch64::GPR64RegClass);

  PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);

  // Calculate the SME state size.
  {
    auto *TLI = Subtarget->getTargetLowering();
    // Uses the pass-level TRI (set in runOnMachineFunction) rather than a
    // shadowing local.
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
        .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_SME_STATE_SIZE))
        .addReg(AArch64::X0, RegState::ImplicitDefine)
        .addRegMask(TRI->getCallPreservedMask(
            *MF, CallingConv::
                     AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
    // The size is returned in X0.
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferSize)
        .addReg(AArch64::X0);
  }

  // Allocate a buffer object of the size given by __arm_sme_state_size.
  {
    MachineFrameInfo &MFI = MF->getFrameInfo();
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
        .addReg(AArch64::SP)
        .addReg(BufferSize)
        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
        .addReg(AArch64::SP);

    // We have just allocated a variable sized object, tell this to PEI.
    MFI.CreateVariableSizedObject(Align(16), nullptr);
  }

  restorePhyRegSave(RegSave, MBB, MBBI, DL);
}
void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt,
ZAState From, ZAState To,
LiveRegs PhysLiveRegs) {
LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
// ZA not used.
if (From == ZAState::ANY || To == ZAState::ANY)
@ -641,12 +802,13 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
}
if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
emitSetupLazySave(MBB, InsertPt);
emitZASave(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs);
emitZARestore(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
else if (To == ZAState::OFF) {
assert(From != ZAState::CALLER_DORMANT &&
"CALLER_DORMANT to OFF should have already been handled");
assert(!IsAgnosticZA && "Should not turn ZA off in agnostic ZA function");
emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
} else {
dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
@ -666,7 +828,8 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State())
if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() &&
!SMEFnAttrs.hasAgnosticZAInterface())
return false;
assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
@ -679,18 +842,36 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
TII = Subtarget->getInstrInfo();
TRI = Subtarget->getRegisterInfo();
MRI = &MF.getRegInfo();
if (OptLevel != CodeGenOptLevel::None)
MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
collectNeededZAStates(SMEFnAttrs);
assignBundleZAStates();
insertStateChanges();
insertStateChanges(/*IsAgnosticZA=*/IsAgnosticZA);
// Allocate save buffer (if needed).
if (State.TPIDR2Block) {
MachineBasicBlock &EntryBlock = MF.front();
emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
if (State.HasFullZASaveRestore || State.TPIDR2Block) {
if (State.AfterSMEProloguePt) {
// Note: With inline stack probes the AfterSMEProloguePt may not be in the
// entry block (due to the probing loop).
emitAllocateZASaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
*State.AfterSMEProloguePt,
State.PhysLiveRegsAfterSMEPrologue,
/*IsAgnosticZA=*/IsAgnosticZA);
} else {
MachineBasicBlock &EntryBlock = MF.front();
emitAllocateZASaveBuffer(
EntryBlock, EntryBlock.getFirstNonPHI(),
State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry,
/*IsAgnosticZA=*/IsAgnosticZA);
}
}
return true;
}
FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
/// Creates the MachineSMEABI pass. \p OptLevel is forwarded to the pass,
/// which only requests loop analysis when it is not CodeGenOptLevel::None.
FunctionPass *llvm::createMachineSMEABIPass(CodeGenOptLevel OptLevel) {
return new MachineSMEABI(OptLevel);
}

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mattr=+sme2 < %s | FileCheck %s
; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s
; RUN: llc -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING
target triple = "aarch64"
@ -9,10 +9,10 @@ declare i64 @agnostic_decl(i64) "aarch64_za_state_agnostic"
; No calls. Test that no buffer is allocated.
define i64 @agnostic_caller_no_callees(ptr %ptr) nounwind "aarch64_za_state_agnostic" {
; CHECK-LABEL: agnostic_caller_no_callees:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x0, [x0]
; CHECK-NEXT: ret
; CHECK-COMMON-LABEL: agnostic_caller_no_callees:
; CHECK-COMMON: // %bb.0:
; CHECK-COMMON-NEXT: ldr x0, [x0]
; CHECK-COMMON-NEXT: ret
%v = load i64, ptr %ptr
ret i64 %v
}
@ -51,6 +51,29 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: agnostic_caller_private_za_callee:
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: mov x29, sp
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
; CHECK-NEWLOWERING-NEXT: mov x19, sp
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
ret i64 %res2
@ -60,12 +83,12 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
;
; Should not result in save/restore code.
define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_agnostic" {
; CHECK-LABEL: agnostic_caller_agnostic_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: bl agnostic_decl
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-COMMON-LABEL: agnostic_caller_agnostic_callee:
; CHECK-COMMON: // %bb.0:
; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-COMMON-NEXT: bl agnostic_decl
; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ret
%res = call i64 @agnostic_decl(i64 %v)
ret i64 %res
}
@ -74,12 +97,12 @@ define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_a
;
; Should not result in lazy-save or save of ZT0
define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "aarch64_inout_zt0" {
; CHECK-LABEL: shared_caller_agnostic_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: bl agnostic_decl
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-COMMON-LABEL: shared_caller_agnostic_callee:
; CHECK-COMMON: // %bb.0:
; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-COMMON-NEXT: bl agnostic_decl
; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ret
%res = call i64 @agnostic_decl(i64 %v)
ret i64 %res
}
@ -126,6 +149,41 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee:
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: add x29, sp, #64
; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
; CHECK-NEWLOWERING-NEXT: mov x20, sp
; CHECK-NEWLOWERING-NEXT: mov x0, x20
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x20
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
ret i64 %res2
@ -187,6 +245,55 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee:
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: add x29, sp, #64
; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
; CHECK-NEWLOWERING-NEXT: mov x19, sp
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state
; CHECK-NEWLOWERING-NEXT: mov x20, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_2
; CHECK-NEWLOWERING-NEXT: // %bb.1:
; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: .LBB5_2:
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4
; CHECK-NEWLOWERING-NEXT: // %bb.3:
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: .LBB5_4:
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6
; CHECK-NEWLOWERING-NEXT: // %bb.5:
; CHECK-NEWLOWERING-NEXT: smstop sm
; CHECK-NEWLOWERING-NEXT: .LBB5_6:
; CHECK-NEWLOWERING-NEXT: bl private_za_decl
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8
; CHECK-NEWLOWERING-NEXT: // %bb.7:
; CHECK-NEWLOWERING-NEXT: smstart sm
; CHECK-NEWLOWERING-NEXT: .LBB5_8:
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
ret i64 %res2
@ -223,6 +330,31 @@ define i64 @test_many_callee_arguments(
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: test_many_callee_arguments:
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: mov x29, sp
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
; CHECK-NEWLOWERING-NEXT: mov x19, sp
; CHECK-NEWLOWERING-NEXT: ldp x9, x10, [x29, #32]
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
; CHECK-NEWLOWERING-NEXT: stp x9, x10, [sp, #-16]!
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee
; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9
) nounwind "aarch64_za_state_agnostic" {
%ret = call i64 @many_args_private_za_callee(

View File

@ -0,0 +1,115 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O0
; RUN: llc -O1 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O1
declare void @private_za_call()
declare void @shared_za_call() "aarch64_inout_za"
; This test checks that at -O0 we don't attempt to optimize lazy save state
; changes in loops, and that at -O1 (and above) we attempt to push state
; changes out of loops.
define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
; CHECK-O0-LABEL: private_za_loop_active_entry_and_exit:
; CHECK-O0: // %bb.0: // %entry
; CHECK-O0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-O0-NEXT: mov x29, sp
; CHECK-O0-NEXT: sub sp, sp, #32
; CHECK-O0-NEXT: rdsvl x9, #1
; CHECK-O0-NEXT: mov x8, sp
; CHECK-O0-NEXT: msub x8, x9, x9, x8
; CHECK-O0-NEXT: mov sp, x8
; CHECK-O0-NEXT: stp x8, x9, [x29, #-16]
; CHECK-O0-NEXT: stur w0, [x29, #-24] // 4-byte Folded Spill
; CHECK-O0-NEXT: bl shared_za_call
; CHECK-O0-NEXT: ldur w0, [x29, #-24] // 4-byte Folded Reload
; CHECK-O0-NEXT: mov w8, wzr
; CHECK-O0-NEXT: subs w9, w0, #1
; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
; CHECK-O0-NEXT: b.lt .LBB0_4
; CHECK-O0-NEXT: b .LBB0_1
; CHECK-O0-NEXT: .LBB0_1: // %loop
; CHECK-O0-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-O0-NEXT: ldur w8, [x29, #-20] // 4-byte Folded Reload
; CHECK-O0-NEXT: stur w8, [x29, #-28] // 4-byte Folded Spill
; CHECK-O0-NEXT: sub x8, x29, #16
; CHECK-O0-NEXT: msr TPIDR2_EL0, x8
; CHECK-O0-NEXT: bl private_za_call
; CHECK-O0-NEXT: ldur w8, [x29, #-28] // 4-byte Folded Reload
; CHECK-O0-NEXT: ldur w10, [x29, #-24] // 4-byte Folded Reload
; CHECK-O0-NEXT: add w9, w8, #1
; CHECK-O0-NEXT: mov w8, w9
; CHECK-O0-NEXT: subs w9, w9, w10
; CHECK-O0-NEXT: mrs x9, NZCV
; CHECK-O0-NEXT: smstart za
; CHECK-O0-NEXT: mrs x10, TPIDR2_EL0
; CHECK-O0-NEXT: sub x0, x29, #16
; CHECK-O0-NEXT: cbz x10, .LBB0_2
; CHECK-O0-NEXT: b .LBB0_3
; CHECK-O0-NEXT: .LBB0_2: // %loop
; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-O0-NEXT: bl __arm_tpidr2_restore
; CHECK-O0-NEXT: b .LBB0_3
; CHECK-O0-NEXT: .LBB0_3: // %loop
; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
; CHECK-O0-NEXT: msr TPIDR2_EL0, xzr
; CHECK-O0-NEXT: msr NZCV, x9
; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
; CHECK-O0-NEXT: b.ne .LBB0_1
; CHECK-O0-NEXT: b .LBB0_4
; CHECK-O0-NEXT: .LBB0_4: // %exit
; CHECK-O0-NEXT: mov sp, x29
; CHECK-O0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-O0-NEXT: b shared_za_call
;
; CHECK-O1-LABEL: private_za_loop_active_entry_and_exit:
; CHECK-O1: // %bb.0: // %entry
; CHECK-O1-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-O1-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-O1-NEXT: mov x29, sp
; CHECK-O1-NEXT: sub sp, sp, #16
; CHECK-O1-NEXT: rdsvl x8, #1
; CHECK-O1-NEXT: mov x9, sp
; CHECK-O1-NEXT: msub x9, x8, x8, x9
; CHECK-O1-NEXT: mov sp, x9
; CHECK-O1-NEXT: mov w19, w0
; CHECK-O1-NEXT: stp x9, x8, [x29, #-16]
; CHECK-O1-NEXT: bl shared_za_call
; CHECK-O1-NEXT: cmp w19, #1
; CHECK-O1-NEXT: sub x8, x29, #16
; CHECK-O1-NEXT: msr TPIDR2_EL0, x8
; CHECK-O1-NEXT: b.lt .LBB0_2
; CHECK-O1-NEXT: .LBB0_1: // %loop
; CHECK-O1-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-O1-NEXT: bl private_za_call
; CHECK-O1-NEXT: subs w19, w19, #1
; CHECK-O1-NEXT: b.ne .LBB0_1
; CHECK-O1-NEXT: .LBB0_2: // %exit
; CHECK-O1-NEXT: smstart za
; CHECK-O1-NEXT: mrs x8, TPIDR2_EL0
; CHECK-O1-NEXT: sub x0, x29, #16
; CHECK-O1-NEXT: cbnz x8, .LBB0_4
; CHECK-O1-NEXT: // %bb.3: // %exit
; CHECK-O1-NEXT: bl __arm_tpidr2_restore
; CHECK-O1-NEXT: .LBB0_4: // %exit
; CHECK-O1-NEXT: msr TPIDR2_EL0, xzr
; CHECK-O1-NEXT: mov sp, x29
; CHECK-O1-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-O1-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-O1-NEXT: b shared_za_call
;
; IR under test. The function is "aarch64_inout_za"; @shared_za_call carries the
; same attribute, while @private_za_call is declared without any ZA attribute.
; In the -O0 expectations above, the TPIDR2_EL0 write, `smstart za` and the
; conditional __arm_tpidr2_restore call all sit inside the %loop blocks; in the
; -O1 expectations the TPIDR2_EL0 write precedes the loop and the restore
; sequence appears once in %exit.
;
; %entry: one shared-ZA call, then skip the loop entirely unless %n > 0.
entry:
%cmpgt = icmp sgt i32 %n, 0
tail call void @shared_za_call()
br i1 %cmpgt, label %loop, label %exit
; %loop: calls @private_za_call once per iteration; %iv counts up from 0 and
; the loop exits when the incremented value equals %n.
loop:
%iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ]
tail call void @private_za_call()
%next_iv = add nuw nsw i32 %iv, 1
%cmpeq = icmp eq i32 %next_iv, %n
br i1 %cmpeq, label %exit, label %loop
; %exit: a second shared-ZA call (emitted as a tail branch in both expectations).
exit:
tail call void @shared_za_call()
ret void
}

View File

@ -0,0 +1,71 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING
declare void @private_za_callee()
declare void @shared_za_callee() "aarch64_inout_za"
define void @test_lazy_save() nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mul x9, x8, x8
; CHECK-NEXT: lsr x15, x9, #4
; CHECK-NEXT: bl __chkstk
; CHECK-NEXT: sub x9, sp, x15, lsl #4
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: stur x9, [x29, #-16]
; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w8, [x29, #-8]
; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
; CHECK-NEXT: cbnz x8, .LBB0_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: test_lazy_save:
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: mov x29, sp
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
; CHECK-NEWLOWERING-NEXT: mul x9, x8, x8
; CHECK-NEWLOWERING-NEXT: lsr x15, x9, #4
; CHECK-NEWLOWERING-NEXT: bl __chkstk
; CHECK-NEWLOWERING-NEXT: sub x9, sp, x15, lsl #4
; CHECK-NEWLOWERING-NEXT: mov sp, x9
; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEWLOWERING-NEXT: bl private_za_callee
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_2
; CHECK-NEWLOWERING-NEXT: // %bb.1:
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
; CHECK-NEWLOWERING-NEXT: .LBB0_2:
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
;
; IR under test: an "aarch64_inout_za" function making a single call to
; @private_za_callee, which is declared without any ZA attribute. Both RUN
; configurations (with and without -aarch64-new-sme-abi) expect a
; `bl __chkstk` call before the stack pointer is lowered by the size derived
; from `rdsvl`, followed by the TPIDR2_EL0 setup around the call.
call void @private_za_callee()
ret void
}

View File

@ -102,7 +102,7 @@ exit:
ret void
}
; FIXME: In the new lowering we could weight edges to avoid doing the lazy save in the loop.
; This tests that with the new lowering we push state changes out of loops (at -O1 and above).
define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
; CHECK-LABEL: private_za_loop_active_entry_and_exit:
; CHECK: // %bb.0: // %entry
@ -154,7 +154,7 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
; CHECK-NEWLOWERING-LABEL: private_za_loop_active_entry_and_exit:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: mov x29, sp
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
@ -165,30 +165,25 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEWLOWERING-NEXT: bl shared_za_call
; CHECK-NEWLOWERING-NEXT: cmp w19, #1
; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_5
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader
; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
; CHECK-NEWLOWERING-NEXT: b .LBB1_3
; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %loop
; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: cbz w19, .LBB1_5
; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %loop
; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_2
; CHECK-NEWLOWERING-NEXT: .LBB1_1: // %loop
; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEWLOWERING-NEXT: bl private_za_call
; CHECK-NEWLOWERING-NEXT: sub w19, w19, #1
; CHECK-NEWLOWERING-NEXT: subs w19, w19, #1
; CHECK-NEWLOWERING-NEXT: b.ne .LBB1_1
; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %exit
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %loop
; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_4
; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
; CHECK-NEWLOWERING-NEXT: b .LBB1_2
; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit
; CHECK-NEWLOWERING-NEXT: .LBB1_4: // %exit
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: b shared_za_call
entry:

View File

@ -103,7 +103,6 @@ exit:
ret float %ret
}
; FIXME: This is missing stack probes with -aarch64-new-sme-abi.
define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
; CHECK: // %bb.0:
@ -165,26 +164,35 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
; CHECK-NEWLOWERING-NEXT: mov x9, sp
; CHECK-NEWLOWERING-NEXT: msub x9, x8, x8, x9
; CHECK-NEWLOWERING-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEWLOWERING-NEXT: cmp sp, x9
; CHECK-NEWLOWERING-NEXT: b.le .LBB2_3
; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
; CHECK-NEWLOWERING-NEXT: str xzr, [sp]
; CHECK-NEWLOWERING-NEXT: b .LBB2_1
; CHECK-NEWLOWERING-NEXT: .LBB2_3:
; CHECK-NEWLOWERING-NEXT: mov sp, x9
; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp]
; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_2
; CHECK-NEWLOWERING-NEXT: // %bb.1: // %use_b
; CHECK-NEWLOWERING-NEXT: cbz w0, .LBB2_5
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %use_b
; CHECK-NEWLOWERING-NEXT: fmov s1, #4.00000000
; CHECK-NEWLOWERING-NEXT: fadd s0, s0, s1
; CHECK-NEWLOWERING-NEXT: b .LBB2_3
; CHECK-NEWLOWERING-NEXT: .LBB2_2: // %use_c
; CHECK-NEWLOWERING-NEXT: b .LBB2_6
; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %use_c
; CHECK-NEWLOWERING-NEXT: fmov s0, s1
; CHECK-NEWLOWERING-NEXT: bl cosf
; CHECK-NEWLOWERING-NEXT: .LBB2_3: // %exit
; CHECK-NEWLOWERING-NEXT: .LBB2_6: // %exit
; CHECK-NEWLOWERING-NEXT: smstart za
; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_5
; CHECK-NEWLOWERING-NEXT: // %bb.4: // %exit
; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB2_8
; CHECK-NEWLOWERING-NEXT: // %bb.7: // %exit
; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
; CHECK-NEWLOWERING-NEXT: .LBB2_5: // %exit
; CHECK-NEWLOWERING-NEXT: .LBB2_8: // %exit
; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEWLOWERING-NEXT: mov sp, x29
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload