[RISCV] Add TunePreferAscendingLoadStore for SpacemiT X60 LD/ST fusion (#186967)

Add a tune feature that reverses the callee-saved register spill/restore
order so that addresses are ascending, enabling LD/ST fusion on
processors like SpacemiT X60.

Co-authored-by: LiqinWeng <liqin.weng@spacemit.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Mark Zhuang 2026-03-26 14:02:11 +08:00 committed by GitHub
parent 28497b7e43
commit dfab4fbb81
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 131 additions and 41 deletions

View File

@ -2031,6 +2031,12 @@ def TuneVXRMPipelineFlush
: RISCVSimpleTuneFeature<"vxrm-pipeline-flush", "HasVXRMPipelineFlush",
"true", "VXRM writes causes pipeline flush">;
// Tuning feature: prefer load/store sequences whose addresses ascend.
// When enabled, RISC-V frame lowering reverses the order in which the scalar
// callee-saved registers are spilled and restored so that the stack addresses
// are ascending, which enables load/store fusion on processors such as the
// SpacemiT X60.
def TunePreferAscendingLoadStore
: RISCVSimpleTuneFeature<"prefer-ascending-load-store",
"PreferAscendingLoadStore", "true",
"Prefer ascending load/store order for better "
"clustering and performance">;
def TunePreferVsetvliOverReadVLENB
: RISCVSimpleTuneFeature<
"prefer-vsetvli-over-read-vlenb", "PreferVsetvliOverReadVLENB",

View File

@ -568,7 +568,8 @@ uint64_t RISCVFrameLowering::getStackSizeWithRVVPadding(
static SmallVector<CalleeSavedInfo, 8>
getUnmanagedCSI(const MachineFunction &MF,
const std::vector<CalleeSavedInfo> &CSI) {
const std::vector<CalleeSavedInfo> &CSI,
bool ReverseOrder = false) {
const MachineFrameInfo &MFI = MF.getFrameInfo();
SmallVector<CalleeSavedInfo, 8> NonLibcallCSI;
@ -578,6 +579,11 @@ getUnmanagedCSI(const MachineFunction &MF,
NonLibcallCSI.push_back(CS);
}
// Reverse the order so that load/store operations use ascending addresses,
// enabling better load/store clustering and fusion.
if (ReverseOrder)
std::reverse(NonLibcallCSI.begin(), NonLibcallCSI.end());
return NonLibcallCSI;
}
@ -932,6 +938,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
MachineBasicBlock::iterator MBBI = MBB.begin();
bool PreferAscendingLS = STI.preferAscendingLoadStore();
Register BPReg = RISCVABI::getBPReg();
@ -967,8 +974,9 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// Skip to before the spills of scalar callee-saved registers
// FIXME: assumes exactly one instruction is used to save each
// callee-saved register.
MBBI = std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() +
getUnmanagedCSI(MF, CSI).size());
MBBI =
std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() +
getUnmanagedCSI(MF, CSI, PreferAscendingLS).size());
CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
bool NeedsDwarfCFI = needsDwarfCFI(MF);
@ -1087,13 +1095,14 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// to the stack, not before.
// FIXME: assumes exactly one instruction is used to save each callee-saved
// register.
std::advance(MBBI, getUnmanagedCSI(MF, CSI).size());
std::advance(MBBI, getUnmanagedCSI(MF, CSI, PreferAscendingLS).size());
CFIBuilder.setInsertPoint(MBBI);
// Iterate over list of callee-saved registers and emit .cfi_offset
// directives.
if (NeedsDwarfCFI) {
for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) {
for (const CalleeSavedInfo &CS :
getUnmanagedCSI(MF, CSI, PreferAscendingLS)) {
MCRegister Reg = CS.getReg();
int64_t Offset = MFI.getObjectOffset(CS.getFrameIdx());
// Emit CFI for both sub-registers. The even register is at the base
@ -1242,6 +1251,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
bool PreferAscendingLS = STI.preferAscendingLoadStore();
// All calls are tail calls in GHC calling conv, and functions have no
// prologue/epilogue.
@ -1342,7 +1352,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
// Skip to after the restores of scalar callee-saved registers
// FIXME: assumes exactly one instruction is used to restore each
// callee-saved register.
MBBI = std::next(FirstScalarCSRRestoreInsn, getUnmanagedCSI(MF, CSI).size());
MBBI = std::next(FirstScalarCSRRestoreInsn,
getUnmanagedCSI(MF, CSI, PreferAscendingLS).size());
CFIBuilder.setInsertPoint(MBBI);
if (getLibCallID(MF, CSI) != -1) {
@ -1360,7 +1371,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
// Recover callee-saved registers.
if (NeedsDwarfCFI) {
for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) {
for (const CalleeSavedInfo &CS :
getUnmanagedCSI(MF, CSI, PreferAscendingLS)) {
MCRegister Reg = CS.getReg();
// Emit CFI for both sub-registers.
if (RISCV::GPRPairRegClass.contains(Reg)) {
@ -1431,7 +1443,8 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// Callee-saved registers should be referenced relative to the stack
// pointer (positive offset), otherwise use the frame pointer (negative
// offset).
const auto &CSI = getUnmanagedCSI(MF, MFI.getCalleeSavedInfo());
const auto &CSI = getUnmanagedCSI(MF, MFI.getCalleeSavedInfo(),
STI.preferAscendingLoadStore());
int MinCSFI = 0;
int MaxCSFI = -1;
StackOffset Offset;
@ -1451,8 +1464,8 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
if (CSI.size()) {
MinCSFI = CSI[0].getFrameIdx();
MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
MinCSFI = std::min(CSI.front().getFrameIdx(), CSI.back().getFrameIdx());
MaxCSFI = std::max(CSI.front().getFrameIdx(), CSI.back().getFrameIdx());
}
if (FI >= MinCSFI && FI <= MaxCSFI) {
@ -2259,7 +2272,8 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters(
}
// Manually spill values not spilled by libcall & Push/Pop.
const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI);
const auto &UnmanagedCSI =
getUnmanagedCSI(*MF, CSI, STI.preferAscendingLoadStore());
const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, CSI);
auto storeRegsToStackSlots = [&](decltype(UnmanagedCSI) CSInfo) {
@ -2352,7 +2366,8 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters(
// the opportunity to avoid the load-to-use data hazard between
// loading RA and return by RA. loadRegFromStackSlot can insert
// multiple instructions.
const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI);
const auto &UnmanagedCSI =
getUnmanagedCSI(*MF, CSI, STI.preferAscendingLoadStore());
const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, CSI);
auto loadRegFromStackSlot = [&](decltype(UnmanagedCSI) CSInfo) {

View File

@ -899,7 +899,8 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
TuneOptimizedNF3SegmentLoadStore,
TuneOptimizedNF4SegmentLoadStore,
TuneEnableSelectOptimize,
TuneVXRMPipelineFlush]> {
TuneVXRMPipelineFlush,
TunePreferAscendingLoadStore]> {
let MVendorID = 0x710;
let MArchID = 0x8000000058000001;
let MImpID = 0x1000000049772200;

View File

@ -78,6 +78,7 @@
; CHECK-NEXT: optimized-zero-stride-load - Optimized (perform fewer memory operations)zero-stride vector load.
; CHECK-NEXT: permissive-zalrsc - Implementation permits non-base instructions between LR/SC pairs.
; CHECK-NEXT: predictable-select-expensive - Prefer likely predicted branches over selects.
; CHECK-NEXT: prefer-ascending-load-store - Prefer ascending load/store order for better clustering and performance.
; CHECK-NEXT: prefer-vsetvli-over-read-vlenb - Prefer vsetvli over read vlenb CSR to calculate VLEN.
; CHECK-NEXT: prefer-w-inst - Prefer instructions with W suffix.
; CHECK-NEXT: q - 'Q' (Quad-Precision Floating-Point).

View File

@ -293,16 +293,16 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64X60-NEXT: # %bb.2: # %for.cond1.preheader.us.preheader
; RV64X60-NEXT: addi sp, sp, -48
; RV64X60-NEXT: .cfi_def_cfa_offset 48
; RV64X60-NEXT: sd s0, 40(sp) # 8-byte Folded Spill
; RV64X60-NEXT: sd s1, 32(sp) # 8-byte Folded Spill
; RV64X60-NEXT: sd s2, 24(sp) # 8-byte Folded Spill
; RV64X60-NEXT: sd s3, 16(sp) # 8-byte Folded Spill
; RV64X60-NEXT: sd s4, 8(sp) # 8-byte Folded Spill
; RV64X60-NEXT: .cfi_offset s0, -8
; RV64X60-NEXT: .cfi_offset s1, -16
; RV64X60-NEXT: .cfi_offset s2, -24
; RV64X60-NEXT: .cfi_offset s3, -32
; RV64X60-NEXT: sd s3, 16(sp) # 8-byte Folded Spill
; RV64X60-NEXT: sd s2, 24(sp) # 8-byte Folded Spill
; RV64X60-NEXT: sd s1, 32(sp) # 8-byte Folded Spill
; RV64X60-NEXT: sd s0, 40(sp) # 8-byte Folded Spill
; RV64X60-NEXT: .cfi_offset s4, -40
; RV64X60-NEXT: .cfi_offset s3, -32
; RV64X60-NEXT: .cfi_offset s2, -24
; RV64X60-NEXT: .cfi_offset s1, -16
; RV64X60-NEXT: .cfi_offset s0, -8
; RV64X60-NEXT: li t0, 0
; RV64X60-NEXT: li t1, 0
; RV64X60-NEXT: addi s1, a7, -1
@ -401,16 +401,16 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64X60-NEXT: bne s0, s2, .LBB0_10
; RV64X60-NEXT: j .LBB0_3
; RV64X60-NEXT: .LBB0_11:
; RV64X60-NEXT: ld s0, 40(sp) # 8-byte Folded Reload
; RV64X60-NEXT: ld s1, 32(sp) # 8-byte Folded Reload
; RV64X60-NEXT: ld s2, 24(sp) # 8-byte Folded Reload
; RV64X60-NEXT: ld s3, 16(sp) # 8-byte Folded Reload
; RV64X60-NEXT: ld s4, 8(sp) # 8-byte Folded Reload
; RV64X60-NEXT: .cfi_restore s0
; RV64X60-NEXT: .cfi_restore s1
; RV64X60-NEXT: .cfi_restore s2
; RV64X60-NEXT: .cfi_restore s3
; RV64X60-NEXT: ld s3, 16(sp) # 8-byte Folded Reload
; RV64X60-NEXT: ld s2, 24(sp) # 8-byte Folded Reload
; RV64X60-NEXT: ld s1, 32(sp) # 8-byte Folded Reload
; RV64X60-NEXT: ld s0, 40(sp) # 8-byte Folded Reload
; RV64X60-NEXT: .cfi_restore s4
; RV64X60-NEXT: .cfi_restore s3
; RV64X60-NEXT: .cfi_restore s2
; RV64X60-NEXT: .cfi_restore s1
; RV64X60-NEXT: .cfi_restore s0
; RV64X60-NEXT: addi sp, sp, 48
; RV64X60-NEXT: .cfi_def_cfa_offset 0
; RV64X60-NEXT: .LBB0_12: # %for.cond.cleanup

View File

@ -0,0 +1,58 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefix=NOT-X60 %s
; RUN: llc -mtriple=riscv64 -mcpu=spacemit-x60 -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefix=X60 %s
declare void @callee(ptr)
; Checks the order of the callee-saved register spills and reloads in the
; prologue and epilogue. With the default riscv64 tuning, ra (24(sp)) is
; saved and restored before s0 (16(sp)), i.e. in descending address order.
; With -mcpu=spacemit-x60 the order is reversed, so s0 at 16(sp) comes
; first and ra at 24(sp) second, giving ascending addresses. The .cfi_offset
; and .cfi_restore directives follow the same order as the spills and reloads.
define void @test_stack_sort() {
; NOT-X60-LABEL: test_stack_sort:
; NOT-X60: # %bb.0:
; NOT-X60-NEXT: addi sp, sp, -32
; NOT-X60-NEXT: .cfi_def_cfa_offset 32
; NOT-X60-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; NOT-X60-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; NOT-X60-NEXT: .cfi_offset ra, -8
; NOT-X60-NEXT: .cfi_offset s0, -16
; NOT-X60-NEXT: addi s0, sp, 32
; NOT-X60-NEXT: .cfi_def_cfa s0, 0
; NOT-X60-NEXT: andi sp, sp, -32
; NOT-X60-NEXT: mv a0, sp
; NOT-X60-NEXT: call callee
; NOT-X60-NEXT: addi sp, s0, -32
; NOT-X60-NEXT: .cfi_def_cfa sp, 32
; NOT-X60-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; NOT-X60-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; NOT-X60-NEXT: .cfi_restore ra
; NOT-X60-NEXT: .cfi_restore s0
; NOT-X60-NEXT: addi sp, sp, 32
; NOT-X60-NEXT: .cfi_def_cfa_offset 0
; NOT-X60-NEXT: ret
;
; X60-LABEL: test_stack_sort:
; X60: # %bb.0:
; X60-NEXT: addi sp, sp, -32
; X60-NEXT: .cfi_def_cfa_offset 32
; X60-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
; X60-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
; X60-NEXT: .cfi_offset s0, -16
; X60-NEXT: .cfi_offset ra, -8
; X60-NEXT: addi s0, sp, 32
; X60-NEXT: .cfi_def_cfa s0, 0
; X60-NEXT: andi sp, sp, -32
; X60-NEXT: mv a0, sp
; X60-NEXT: call callee
; X60-NEXT: addi sp, s0, -32
; X60-NEXT: .cfi_def_cfa sp, 32
; X60-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
; X60-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
; X60-NEXT: .cfi_restore s0
; X60-NEXT: .cfi_restore ra
; X60-NEXT: addi sp, sp, 32
; X60-NEXT: .cfi_def_cfa_offset 0
; X60-NEXT: ret
; The 32-byte-aligned alloca passed to an external call yields a frame in
; which both ra and s0 are saved (see the expected output above), giving two
; adjacent callee-saved slots whose relative order can be observed.
%1 = alloca i8, align 32
call void @callee(ptr %1)
ret void
}

View File

@ -44,18 +44,27 @@ TEST(RISCVTuneFeature, AllTuneFeatures) {
RISCV::getAllTuneFeatures(AllTuneFeatures);
// Only allowed subtarget features that are explicitly marked by
// special TableGen class.
EXPECT_EQ(AllTuneFeatures.size(), 19U);
for (auto F :
{"conditional-cmv-fusion", "disable-latency-sched-heuristic",
"disable-misched-load-clustering", "disable-misched-store-clustering",
"disable-postmisched-load-clustering",
"disable-postmisched-store-clustering", "single-element-vec-fp64",
"no-default-unroll", "no-sink-splat-operands", "use-postra-scheduler",
"predictable-select-expensive", "prefer-vsetvli-over-read-vlenb",
"prefer-w-inst", "short-forward-branch-ialu",
"short-forward-branch-iminmax", "short-forward-branch-imul",
"short-forward-branch-iload", "vl-dependent-latency",
"vxrm-pipeline-flush"})
EXPECT_EQ(AllTuneFeatures.size(), 20U);
for (auto F : {"conditional-cmv-fusion",
"disable-latency-sched-heuristic",
"disable-misched-load-clustering",
"disable-misched-store-clustering",
"disable-postmisched-load-clustering",
"disable-postmisched-store-clustering",
"single-element-vec-fp64",
"no-default-unroll",
"no-sink-splat-operands",
"use-postra-scheduler",
"predictable-select-expensive",
"prefer-ascending-load-store",
"prefer-vsetvli-over-read-vlenb",
"prefer-w-inst",
"short-forward-branch-ialu",
"short-forward-branch-iminmax",
"short-forward-branch-imul",
"short-forward-branch-iload",
"vl-dependent-latency",
"vxrm-pipeline-flush"})
EXPECT_TRUE(is_contained(AllTuneFeatures, F));
}