[RISCV] Add TunePreferAscendingLoadStore for SpacemiT X60 LD/ST fusion (#186967)
Add a tune feature that reverses the callee-saved register spill/restore order so that addresses are ascending, enabling LD/ST fusion on processors like SpacemiT X60. Co-authored-by: LiqinWeng <liqin.weng@spacemit.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
28497b7e43
commit
dfab4fbb81
@ -2031,6 +2031,12 @@ def TuneVXRMPipelineFlush
|
||||
: RISCVSimpleTuneFeature<"vxrm-pipeline-flush", "HasVXRMPipelineFlush",
|
||||
"true", "VXRM writes causes pipeline flush">;
|
||||
|
||||
// Tuning flag exposed to the subtarget as PreferAscendingLoadStore. When it is
// set, RISCVFrameLowering reverses the list of callee-saved registers that it
// spills/restores itself, so the resulting stores and loads touch ascending
// stack addresses. It is part of the SpacemiT X60 processor model.
def TunePreferAscendingLoadStore
|
||||
: RISCVSimpleTuneFeature<"prefer-ascending-load-store",
|
||||
"PreferAscendingLoadStore", "true",
|
||||
"Prefer ascending load/store order for better "
|
||||
"clustering and performance">;
|
||||
|
||||
def TunePreferVsetvliOverReadVLENB
|
||||
: RISCVSimpleTuneFeature<
|
||||
"prefer-vsetvli-over-read-vlenb", "PreferVsetvliOverReadVLENB",
|
||||
|
||||
@ -568,7 +568,8 @@ uint64_t RISCVFrameLowering::getStackSizeWithRVVPadding(
|
||||
|
||||
static SmallVector<CalleeSavedInfo, 8>
|
||||
getUnmanagedCSI(const MachineFunction &MF,
|
||||
const std::vector<CalleeSavedInfo> &CSI) {
|
||||
const std::vector<CalleeSavedInfo> &CSI,
|
||||
bool ReverseOrder = false) {
|
||||
const MachineFrameInfo &MFI = MF.getFrameInfo();
|
||||
SmallVector<CalleeSavedInfo, 8> NonLibcallCSI;
|
||||
|
||||
@ -578,6 +579,11 @@ getUnmanagedCSI(const MachineFunction &MF,
|
||||
NonLibcallCSI.push_back(CS);
|
||||
}
|
||||
|
||||
// Reverse the order so that load/store operations use ascending addresses,
|
||||
// enabling better load/store clustering and fusion.
|
||||
if (ReverseOrder)
|
||||
std::reverse(NonLibcallCSI.begin(), NonLibcallCSI.end());
|
||||
|
||||
return NonLibcallCSI;
|
||||
}
|
||||
|
||||
@ -932,6 +938,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
|
||||
auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
|
||||
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
|
||||
MachineBasicBlock::iterator MBBI = MBB.begin();
|
||||
bool PreferAscendingLS = STI.preferAscendingLoadStore();
|
||||
|
||||
Register BPReg = RISCVABI::getBPReg();
|
||||
|
||||
@ -967,8 +974,9 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
|
||||
// Skip to before the spills of scalar callee-saved registers
|
||||
// FIXME: assumes exactly one instruction is used to save each
|
||||
// callee-saved register.
|
||||
MBBI = std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() +
|
||||
getUnmanagedCSI(MF, CSI).size());
|
||||
MBBI =
|
||||
std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() +
|
||||
getUnmanagedCSI(MF, CSI, PreferAscendingLS).size());
|
||||
CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
|
||||
bool NeedsDwarfCFI = needsDwarfCFI(MF);
|
||||
|
||||
@ -1087,13 +1095,14 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
|
||||
// to the stack, not before.
|
||||
// FIXME: assumes exactly one instruction is used to save each callee-saved
|
||||
// register.
|
||||
std::advance(MBBI, getUnmanagedCSI(MF, CSI).size());
|
||||
std::advance(MBBI, getUnmanagedCSI(MF, CSI, PreferAscendingLS).size());
|
||||
CFIBuilder.setInsertPoint(MBBI);
|
||||
|
||||
// Iterate over list of callee-saved registers and emit .cfi_offset
|
||||
// directives.
|
||||
if (NeedsDwarfCFI) {
|
||||
for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) {
|
||||
for (const CalleeSavedInfo &CS :
|
||||
getUnmanagedCSI(MF, CSI, PreferAscendingLS)) {
|
||||
MCRegister Reg = CS.getReg();
|
||||
int64_t Offset = MFI.getObjectOffset(CS.getFrameIdx());
|
||||
// Emit CFI for both sub-registers. The even register is at the base
|
||||
@ -1242,6 +1251,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
|
||||
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
|
||||
MachineFrameInfo &MFI = MF.getFrameInfo();
|
||||
auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
|
||||
bool PreferAscendingLS = STI.preferAscendingLoadStore();
|
||||
|
||||
// All calls are tail calls in GHC calling conv, and functions have no
|
||||
// prologue/epilogue.
|
||||
@ -1342,7 +1352,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
|
||||
// Skip to after the restores of scalar callee-saved registers
|
||||
// FIXME: assumes exactly one instruction is used to restore each
|
||||
// callee-saved register.
|
||||
MBBI = std::next(FirstScalarCSRRestoreInsn, getUnmanagedCSI(MF, CSI).size());
|
||||
MBBI = std::next(FirstScalarCSRRestoreInsn,
|
||||
getUnmanagedCSI(MF, CSI, PreferAscendingLS).size());
|
||||
CFIBuilder.setInsertPoint(MBBI);
|
||||
|
||||
if (getLibCallID(MF, CSI) != -1) {
|
||||
@ -1360,7 +1371,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
|
||||
|
||||
// Recover callee-saved registers.
|
||||
if (NeedsDwarfCFI) {
|
||||
for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) {
|
||||
for (const CalleeSavedInfo &CS :
|
||||
getUnmanagedCSI(MF, CSI, PreferAscendingLS)) {
|
||||
MCRegister Reg = CS.getReg();
|
||||
// Emit CFI for both sub-registers.
|
||||
if (RISCV::GPRPairRegClass.contains(Reg)) {
|
||||
@ -1431,7 +1443,8 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
|
||||
// Callee-saved registers should be referenced relative to the stack
|
||||
// pointer (positive offset), otherwise use the frame pointer (negative
|
||||
// offset).
|
||||
const auto &CSI = getUnmanagedCSI(MF, MFI.getCalleeSavedInfo());
|
||||
const auto &CSI = getUnmanagedCSI(MF, MFI.getCalleeSavedInfo(),
|
||||
STI.preferAscendingLoadStore());
|
||||
int MinCSFI = 0;
|
||||
int MaxCSFI = -1;
|
||||
StackOffset Offset;
|
||||
@ -1451,8 +1464,8 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
|
||||
uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
|
||||
|
||||
if (CSI.size()) {
|
||||
MinCSFI = CSI[0].getFrameIdx();
|
||||
MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
|
||||
MinCSFI = std::min(CSI.front().getFrameIdx(), CSI.back().getFrameIdx());
|
||||
MaxCSFI = std::max(CSI.front().getFrameIdx(), CSI.back().getFrameIdx());
|
||||
}
|
||||
|
||||
if (FI >= MinCSFI && FI <= MaxCSFI) {
|
||||
@ -2259,7 +2272,8 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters(
|
||||
}
|
||||
|
||||
// Manually spill values not spilled by libcall & Push/Pop.
|
||||
const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI);
|
||||
const auto &UnmanagedCSI =
|
||||
getUnmanagedCSI(*MF, CSI, STI.preferAscendingLoadStore());
|
||||
const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, CSI);
|
||||
|
||||
auto storeRegsToStackSlots = [&](decltype(UnmanagedCSI) CSInfo) {
|
||||
@ -2352,7 +2366,8 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters(
|
||||
// the opportunity to avoid the load-to-use data hazard between
|
||||
// loading RA and return by RA. loadRegFromStackSlot can insert
|
||||
// multiple instructions.
|
||||
const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI);
|
||||
const auto &UnmanagedCSI =
|
||||
getUnmanagedCSI(*MF, CSI, STI.preferAscendingLoadStore());
|
||||
const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, CSI);
|
||||
|
||||
auto loadRegFromStackSlot = [&](decltype(UnmanagedCSI) CSInfo) {
|
||||
|
||||
@ -899,7 +899,8 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
|
||||
TuneOptimizedNF3SegmentLoadStore,
|
||||
TuneOptimizedNF4SegmentLoadStore,
|
||||
TuneEnableSelectOptimize,
|
||||
TuneVXRMPipelineFlush]> {
|
||||
TuneVXRMPipelineFlush,
|
||||
TunePreferAscendingLoadStore]> {
|
||||
let MVendorID = 0x710;
|
||||
let MArchID = 0x8000000058000001;
|
||||
let MImpID = 0x1000000049772200;
|
||||
|
||||
@ -78,6 +78,7 @@
|
||||
; CHECK-NEXT: optimized-zero-stride-load - Optimized (perform fewer memory operations)zero-stride vector load.
|
||||
; CHECK-NEXT: permissive-zalrsc - Implementation permits non-base instructions between LR/SC pairs.
|
||||
; CHECK-NEXT: predictable-select-expensive - Prefer likely predicted branches over selects.
|
||||
; CHECK-NEXT: prefer-ascending-load-store - Prefer ascending load/store order for better clustering and performance.
|
||||
; CHECK-NEXT: prefer-vsetvli-over-read-vlenb - Prefer vsetvli over read vlenb CSR to calculate VLEN.
|
||||
; CHECK-NEXT: prefer-w-inst - Prefer instructions with W suffix.
|
||||
; CHECK-NEXT: q - 'Q' (Quad-Precision Floating-Point).
|
||||
|
||||
@ -293,16 +293,16 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
|
||||
; RV64X60-NEXT: # %bb.2: # %for.cond1.preheader.us.preheader
|
||||
; RV64X60-NEXT: addi sp, sp, -48
|
||||
; RV64X60-NEXT: .cfi_def_cfa_offset 48
|
||||
; RV64X60-NEXT: sd s0, 40(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: sd s1, 32(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: sd s2, 24(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: sd s3, 16(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: sd s4, 8(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: .cfi_offset s0, -8
|
||||
; RV64X60-NEXT: .cfi_offset s1, -16
|
||||
; RV64X60-NEXT: .cfi_offset s2, -24
|
||||
; RV64X60-NEXT: .cfi_offset s3, -32
|
||||
; RV64X60-NEXT: sd s3, 16(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: sd s2, 24(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: sd s1, 32(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: sd s0, 40(sp) # 8-byte Folded Spill
|
||||
; RV64X60-NEXT: .cfi_offset s4, -40
|
||||
; RV64X60-NEXT: .cfi_offset s3, -32
|
||||
; RV64X60-NEXT: .cfi_offset s2, -24
|
||||
; RV64X60-NEXT: .cfi_offset s1, -16
|
||||
; RV64X60-NEXT: .cfi_offset s0, -8
|
||||
; RV64X60-NEXT: li t0, 0
|
||||
; RV64X60-NEXT: li t1, 0
|
||||
; RV64X60-NEXT: addi s1, a7, -1
|
||||
@ -401,16 +401,16 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
|
||||
; RV64X60-NEXT: bne s0, s2, .LBB0_10
|
||||
; RV64X60-NEXT: j .LBB0_3
|
||||
; RV64X60-NEXT: .LBB0_11:
|
||||
; RV64X60-NEXT: ld s0, 40(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: ld s1, 32(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: ld s2, 24(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: ld s3, 16(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: ld s4, 8(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: .cfi_restore s0
|
||||
; RV64X60-NEXT: .cfi_restore s1
|
||||
; RV64X60-NEXT: .cfi_restore s2
|
||||
; RV64X60-NEXT: .cfi_restore s3
|
||||
; RV64X60-NEXT: ld s3, 16(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: ld s2, 24(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: ld s1, 32(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: ld s0, 40(sp) # 8-byte Folded Reload
|
||||
; RV64X60-NEXT: .cfi_restore s4
|
||||
; RV64X60-NEXT: .cfi_restore s3
|
||||
; RV64X60-NEXT: .cfi_restore s2
|
||||
; RV64X60-NEXT: .cfi_restore s1
|
||||
; RV64X60-NEXT: .cfi_restore s0
|
||||
; RV64X60-NEXT: addi sp, sp, 48
|
||||
; RV64X60-NEXT: .cfi_def_cfa_offset 0
|
||||
; RV64X60-NEXT: .LBB0_12: # %for.cond.cleanup
|
||||
|
||||
58
llvm/test/CodeGen/RISCV/spacemitx60-stack-reorder.ll
Normal file
58
llvm/test/CodeGen/RISCV/spacemitx60-stack-reorder.ll
Normal file
@ -0,0 +1,58 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
|
||||
; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
|
||||
; RUN: | FileCheck -check-prefix=NOT-X60 %s
|
||||
; RUN: llc -mtriple=riscv64 -mcpu=spacemit-x60 -verify-machineinstrs < %s \
|
||||
; RUN: | FileCheck -check-prefix=X60 %s
|
||||
|
||||
declare void @callee(ptr)
|
||||
|
||||
; Checks the order in which the callee-saved registers ra and s0 are spilled
; and reloaded. The 32-byte-aligned alloca makes the function set up s0 and
; realign sp, so both ra and s0 are saved. In the default riscv64 run the
; accesses go ra at 24(sp) then s0 at 16(sp) (descending addresses); in the
; -mcpu=spacemit-x60 run they go s0 at 16(sp) then ra at 24(sp) (ascending
; addresses), and the .cfi_offset / .cfi_restore directives follow the same
; order. The stack slots themselves do not move between the two runs.
define void @test_stack_sort() {
|
||||
; NOT-X60-LABEL: test_stack_sort:
|
||||
; NOT-X60: # %bb.0:
|
||||
; NOT-X60-NEXT: addi sp, sp, -32
|
||||
; NOT-X60-NEXT: .cfi_def_cfa_offset 32
|
||||
; NOT-X60-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
|
||||
; NOT-X60-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
|
||||
; NOT-X60-NEXT: .cfi_offset ra, -8
|
||||
; NOT-X60-NEXT: .cfi_offset s0, -16
|
||||
; NOT-X60-NEXT: addi s0, sp, 32
|
||||
; NOT-X60-NEXT: .cfi_def_cfa s0, 0
|
||||
; NOT-X60-NEXT: andi sp, sp, -32
|
||||
; NOT-X60-NEXT: mv a0, sp
|
||||
; NOT-X60-NEXT: call callee
|
||||
; NOT-X60-NEXT: addi sp, s0, -32
|
||||
; NOT-X60-NEXT: .cfi_def_cfa sp, 32
|
||||
; NOT-X60-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
|
||||
; NOT-X60-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
|
||||
; NOT-X60-NEXT: .cfi_restore ra
|
||||
; NOT-X60-NEXT: .cfi_restore s0
|
||||
; NOT-X60-NEXT: addi sp, sp, 32
|
||||
; NOT-X60-NEXT: .cfi_def_cfa_offset 0
|
||||
; NOT-X60-NEXT: ret
|
||||
;
|
||||
; X60-LABEL: test_stack_sort:
|
||||
; X60: # %bb.0:
|
||||
; X60-NEXT: addi sp, sp, -32
|
||||
; X60-NEXT: .cfi_def_cfa_offset 32
|
||||
; X60-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
|
||||
; X60-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
|
||||
; X60-NEXT: .cfi_offset s0, -16
|
||||
; X60-NEXT: .cfi_offset ra, -8
|
||||
; X60-NEXT: addi s0, sp, 32
|
||||
; X60-NEXT: .cfi_def_cfa s0, 0
|
||||
; X60-NEXT: andi sp, sp, -32
|
||||
; X60-NEXT: mv a0, sp
|
||||
; X60-NEXT: call callee
|
||||
; X60-NEXT: addi sp, s0, -32
|
||||
; X60-NEXT: .cfi_def_cfa sp, 32
|
||||
; X60-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
|
||||
; X60-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
|
||||
; X60-NEXT: .cfi_restore s0
|
||||
; X60-NEXT: .cfi_restore ra
|
||||
; X60-NEXT: addi sp, sp, 32
|
||||
; X60-NEXT: .cfi_def_cfa_offset 0
|
||||
; X60-NEXT: ret
|
||||
%1 = alloca i8, align 32
|
||||
call void @callee(ptr %1)
|
||||
ret void
|
||||
}
|
||||
@ -44,18 +44,27 @@ TEST(RISCVTuneFeature, AllTuneFeatures) {
|
||||
RISCV::getAllTuneFeatures(AllTuneFeatures);
|
||||
// Only allowed subtarget features that are explicitly marked by
|
||||
// special TableGen class.
|
||||
EXPECT_EQ(AllTuneFeatures.size(), 19U);
|
||||
for (auto F :
|
||||
{"conditional-cmv-fusion", "disable-latency-sched-heuristic",
|
||||
"disable-misched-load-clustering", "disable-misched-store-clustering",
|
||||
"disable-postmisched-load-clustering",
|
||||
"disable-postmisched-store-clustering", "single-element-vec-fp64",
|
||||
"no-default-unroll", "no-sink-splat-operands", "use-postra-scheduler",
|
||||
"predictable-select-expensive", "prefer-vsetvli-over-read-vlenb",
|
||||
"prefer-w-inst", "short-forward-branch-ialu",
|
||||
"short-forward-branch-iminmax", "short-forward-branch-imul",
|
||||
"short-forward-branch-iload", "vl-dependent-latency",
|
||||
"vxrm-pipeline-flush"})
|
||||
EXPECT_EQ(AllTuneFeatures.size(), 20U);
|
||||
for (auto F : {"conditional-cmv-fusion",
|
||||
"disable-latency-sched-heuristic",
|
||||
"disable-misched-load-clustering",
|
||||
"disable-misched-store-clustering",
|
||||
"disable-postmisched-load-clustering",
|
||||
"disable-postmisched-store-clustering",
|
||||
"single-element-vec-fp64",
|
||||
"no-default-unroll",
|
||||
"no-sink-splat-operands",
|
||||
"use-postra-scheduler",
|
||||
"predictable-select-expensive",
|
||||
"prefer-ascending-load-store",
|
||||
"prefer-vsetvli-over-read-vlenb",
|
||||
"prefer-w-inst",
|
||||
"short-forward-branch-ialu",
|
||||
"short-forward-branch-iminmax",
|
||||
"short-forward-branch-imul",
|
||||
"short-forward-branch-iload",
|
||||
"vl-dependent-latency",
|
||||
"vxrm-pipeline-flush"})
|
||||
EXPECT_TRUE(is_contained(AllTuneFeatures, F));
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user