diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 78abee4bdf15..eb19f80dc7ed 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -2031,6 +2031,12 @@ def TuneVXRMPipelineFlush : RISCVSimpleTuneFeature<"vxrm-pipeline-flush", "HasVXRMPipelineFlush", "true", "VXRM writes causes pipeline flush">; +def TunePreferAscendingLoadStore + : RISCVSimpleTuneFeature<"prefer-ascending-load-store", + "PreferAscendingLoadStore", "true", + "Prefer ascending load/store order for better " + "clustering and performance">; + def TunePreferVsetvliOverReadVLENB : RISCVSimpleTuneFeature< "prefer-vsetvli-over-read-vlenb", "PreferVsetvliOverReadVLENB", diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index b7856b2805cb..960fd0757677 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -568,7 +568,8 @@ uint64_t RISCVFrameLowering::getStackSizeWithRVVPadding( static SmallVector getUnmanagedCSI(const MachineFunction &MF, - const std::vector &CSI) { + const std::vector &CSI, + bool ReverseOrder = false) { const MachineFrameInfo &MFI = MF.getFrameInfo(); SmallVector NonLibcallCSI; @@ -578,6 +579,11 @@ getUnmanagedCSI(const MachineFunction &MF, NonLibcallCSI.push_back(CS); } + // Reverse the order so that load/store operations use ascending addresses, + // enabling better load/store clustering and fusion. + if (ReverseOrder) + std::reverse(NonLibcallCSI.begin(), NonLibcallCSI.end()); + return NonLibcallCSI; } @@ -932,6 +938,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, auto *RVFI = MF.getInfo(); const RISCVRegisterInfo *RI = STI.getRegisterInfo(); MachineBasicBlock::iterator MBBI = MBB.begin(); + bool PreferAscendingLS = STI.preferAscendingLoadStore(); Register BPReg = RISCVABI::getBPReg(); @@ -967,8 +974,9 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // Skip to before the spills of scalar callee-saved registers // FIXME: assumes exactly one instruction is used to restore each // callee-saved register. - MBBI = std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() + - getUnmanagedCSI(MF, CSI).size()); + MBBI = + std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() + + getUnmanagedCSI(MF, CSI, PreferAscendingLS).size()); CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); bool NeedsDwarfCFI = needsDwarfCFI(MF); @@ -1087,13 +1095,14 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // to the stack, not before. // FIXME: assumes exactly one instruction is used to save each callee-saved // register. - std::advance(MBBI, getUnmanagedCSI(MF, CSI).size()); + std::advance(MBBI, getUnmanagedCSI(MF, CSI, PreferAscendingLS).size()); CFIBuilder.setInsertPoint(MBBI); // Iterate over list of callee-saved registers and emit .cfi_offset // directives. if (NeedsDwarfCFI) { - for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) { + for (const CalleeSavedInfo &CS : + getUnmanagedCSI(MF, CSI, PreferAscendingLS)) { MCRegister Reg = CS.getReg(); int64_t Offset = MFI.getObjectOffset(CS.getFrameIdx()); // Emit CFI for both sub-registers. The even register is at the base @@ -1242,6 +1251,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, const RISCVRegisterInfo *RI = STI.getRegisterInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); auto *RVFI = MF.getInfo(); + bool PreferAscendingLS = STI.preferAscendingLoadStore(); // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. @@ -1342,7 +1352,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, // Skip to after the restores of scalar callee-saved registers // FIXME: assumes exactly one instruction is used to restore each // callee-saved register. - MBBI = std::next(FirstScalarCSRRestoreInsn, getUnmanagedCSI(MF, CSI).size()); + MBBI = std::next(FirstScalarCSRRestoreInsn, + getUnmanagedCSI(MF, CSI, PreferAscendingLS).size()); CFIBuilder.setInsertPoint(MBBI); if (getLibCallID(MF, CSI) != -1) { @@ -1360,7 +1371,8 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, // Recover callee-saved registers. if (NeedsDwarfCFI) { - for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) { + for (const CalleeSavedInfo &CS : + getUnmanagedCSI(MF, CSI, PreferAscendingLS)) { MCRegister Reg = CS.getReg(); // Emit CFI for both sub-registers. if (RISCV::GPRPairRegClass.contains(Reg)) { @@ -1431,7 +1443,8 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, // Callee-saved registers should be referenced relative to the stack // pointer (positive offset), otherwise use the frame pointer (negative // offset). - const auto &CSI = getUnmanagedCSI(MF, MFI.getCalleeSavedInfo()); + const auto &CSI = getUnmanagedCSI(MF, MFI.getCalleeSavedInfo(), + STI.preferAscendingLoadStore()); int MinCSFI = 0; int MaxCSFI = -1; StackOffset Offset; @@ -1451,8 +1464,8 @@ RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); if (CSI.size()) { - MinCSFI = CSI[0].getFrameIdx(); - MaxCSFI = CSI[CSI.size() - 1].getFrameIdx(); + MinCSFI = std::min(CSI.front().getFrameIdx(), CSI.back().getFrameIdx()); + MaxCSFI = std::max(CSI.front().getFrameIdx(), CSI.back().getFrameIdx()); } if (FI >= MinCSFI && FI <= MaxCSFI) { @@ -2259,7 +2272,8 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( } // Manually spill values not spilled by libcall & Push/Pop. - const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI); + const auto &UnmanagedCSI = + getUnmanagedCSI(*MF, CSI, STI.preferAscendingLoadStore()); const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, CSI); auto storeRegsToStackSlots = [&](decltype(UnmanagedCSI) CSInfo) { @@ -2352,7 +2366,8 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( // the opportunity to avoid the load-to-use data hazard between // loading RA and return by RA. loadRegFromStackSlot can insert // multiple instructions. - const auto &UnmanagedCSI = getUnmanagedCSI(*MF, CSI); + const auto &UnmanagedCSI = + getUnmanagedCSI(*MF, CSI, STI.preferAscendingLoadStore()); const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, CSI); auto loadRegFromStackSlot = [&](decltype(UnmanagedCSI) CSInfo) { diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 448fad5224f9..44be2a5e9cc5 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -899,7 +899,8 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60", TuneOptimizedNF3SegmentLoadStore, TuneOptimizedNF4SegmentLoadStore, TuneEnableSelectOptimize, - TuneVXRMPipelineFlush]> { + TuneVXRMPipelineFlush, + TunePreferAscendingLoadStore]> { let MVendorID = 0x710; let MArchID = 0x8000000058000001; let MImpID = 0x1000000049772200; diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 9d875299776e..5a137a9dbbc3 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -78,6 +78,7 @@ ; CHECK-NEXT: optimized-zero-stride-load - Optimized (perform fewer memory operations)zero-stride vector load. ; CHECK-NEXT: permissive-zalrsc - Implementation permits non-base instructions between LR/SC pairs. ; CHECK-NEXT: predictable-select-expensive - Prefer likely predicted branches over selects. +; CHECK-NEXT: prefer-ascending-load-store - Prefer ascending load/store order for better clustering and performance. ; CHECK-NEXT: prefer-vsetvli-over-read-vlenb - Prefer vsetvli over read vlenb CSR to calculate VLEN. ; CHECK-NEXT: prefer-w-inst - Prefer instructions with W suffix. ; CHECK-NEXT: q - 'Q' (Quad-Precision Floating-Point). diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index 633d5a35985e..854d76c2d854 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll @@ -293,16 +293,16 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64X60-NEXT: # %bb.2: # %for.cond1.preheader.us.preheader ; RV64X60-NEXT: addi sp, sp, -48 ; RV64X60-NEXT: .cfi_def_cfa_offset 48 -; RV64X60-NEXT: sd s0, 40(sp) # 8-byte Folded Spill -; RV64X60-NEXT: sd s1, 32(sp) # 8-byte Folded Spill -; RV64X60-NEXT: sd s2, 24(sp) # 8-byte Folded Spill -; RV64X60-NEXT: sd s3, 16(sp) # 8-byte Folded Spill ; RV64X60-NEXT: sd s4, 8(sp) # 8-byte Folded Spill -; RV64X60-NEXT: .cfi_offset s0, -8 -; RV64X60-NEXT: .cfi_offset s1, -16 -; RV64X60-NEXT: .cfi_offset s2, -24 -; RV64X60-NEXT: .cfi_offset s3, -32 +; RV64X60-NEXT: sd s3, 16(sp) # 8-byte Folded Spill +; RV64X60-NEXT: sd s2, 24(sp) # 8-byte Folded Spill +; RV64X60-NEXT: sd s1, 32(sp) # 8-byte Folded Spill +; RV64X60-NEXT: sd s0, 40(sp) # 8-byte Folded Spill ; RV64X60-NEXT: .cfi_offset s4, -40 +; RV64X60-NEXT: .cfi_offset s3, -32 +; RV64X60-NEXT: .cfi_offset s2, -24 +; RV64X60-NEXT: .cfi_offset s1, -16 +; RV64X60-NEXT: .cfi_offset s0, -8 ; RV64X60-NEXT: li t0, 0 ; RV64X60-NEXT: li t1, 0 ; RV64X60-NEXT: addi s1, a7, -1 @@ -401,16 +401,16 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64X60-NEXT: bne s0, s2, .LBB0_10 ; RV64X60-NEXT: j .LBB0_3 ; RV64X60-NEXT: .LBB0_11: -; RV64X60-NEXT: ld s0, 40(sp) # 8-byte Folded Reload -; RV64X60-NEXT: ld s1, 32(sp) # 8-byte Folded Reload -; RV64X60-NEXT: ld s2, 24(sp) # 8-byte Folded Reload -; RV64X60-NEXT: ld s3, 16(sp) # 8-byte Folded Reload ; RV64X60-NEXT: ld s4, 8(sp) # 8-byte Folded Reload -; RV64X60-NEXT: .cfi_restore s0 -; RV64X60-NEXT: .cfi_restore s1 -; RV64X60-NEXT: .cfi_restore s2 -; RV64X60-NEXT: .cfi_restore s3 +; RV64X60-NEXT: ld s3, 16(sp) # 8-byte Folded Reload +; RV64X60-NEXT: ld s2, 24(sp) # 8-byte Folded Reload +; RV64X60-NEXT: ld s1, 32(sp) # 8-byte Folded Reload +; RV64X60-NEXT: ld s0, 40(sp) # 8-byte Folded Reload ; RV64X60-NEXT: .cfi_restore s4 +; RV64X60-NEXT: .cfi_restore s3 +; RV64X60-NEXT: .cfi_restore s2 +; RV64X60-NEXT: .cfi_restore s1 +; RV64X60-NEXT: .cfi_restore s0 ; RV64X60-NEXT: addi sp, sp, 48 ; RV64X60-NEXT: .cfi_def_cfa_offset 0 ; RV64X60-NEXT: .LBB0_12: # %for.cond.cleanup diff --git a/llvm/test/CodeGen/RISCV/spacemitx60-stack-reorder.ll b/llvm/test/CodeGen/RISCV/spacemitx60-stack-reorder.ll new file mode 100644 index 000000000000..239fcd11a3f9 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/spacemitx60-stack-reorder.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=NOT-X60 %s +; RUN: llc -mtriple=riscv64 -mcpu=spacemit-x60 -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=X60 %s + +declare void @callee(ptr) + +define void @test_stack_sort() { +; NOT-X60-LABEL: test_stack_sort: +; NOT-X60: # %bb.0: +; NOT-X60-NEXT: addi sp, sp, -32 +; NOT-X60-NEXT: .cfi_def_cfa_offset 32 +; NOT-X60-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; NOT-X60-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; NOT-X60-NEXT: .cfi_offset ra, -8 +; NOT-X60-NEXT: .cfi_offset s0, -16 +; NOT-X60-NEXT: addi s0, sp, 32 +; NOT-X60-NEXT: .cfi_def_cfa s0, 0 +; NOT-X60-NEXT: andi sp, sp, -32 +; NOT-X60-NEXT: mv a0, sp +; NOT-X60-NEXT: call callee +; NOT-X60-NEXT: addi sp, s0, -32 +; NOT-X60-NEXT: .cfi_def_cfa sp, 32 +; NOT-X60-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; NOT-X60-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; NOT-X60-NEXT: .cfi_restore ra +; NOT-X60-NEXT: .cfi_restore s0 +; NOT-X60-NEXT: addi sp, sp, 32 +; NOT-X60-NEXT: .cfi_def_cfa_offset 0 +; NOT-X60-NEXT: ret +; +; X60-LABEL: test_stack_sort: +; X60: # %bb.0: +; X60-NEXT: addi sp, sp, -32 +; X60-NEXT: .cfi_def_cfa_offset 32 +; X60-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; X60-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; X60-NEXT: .cfi_offset s0, -16 +; X60-NEXT: .cfi_offset ra, -8 +; X60-NEXT: addi s0, sp, 32 +; X60-NEXT: .cfi_def_cfa s0, 0 +; X60-NEXT: andi sp, sp, -32 +; X60-NEXT: mv a0, sp +; X60-NEXT: call callee +; X60-NEXT: addi sp, s0, -32 +; X60-NEXT: .cfi_def_cfa sp, 32 +; X60-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; X60-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; X60-NEXT: .cfi_restore s0 +; X60-NEXT: .cfi_restore ra +; X60-NEXT: addi sp, sp, 32 +; X60-NEXT: .cfi_def_cfa_offset 0 +; X60-NEXT: ret + %1 = alloca i8, align 32 + call void @callee(ptr %1) + ret void +} diff --git a/llvm/unittests/TargetParser/RISCVTargetParserTest.cpp b/llvm/unittests/TargetParser/RISCVTargetParserTest.cpp index 41ef491a0445..0302d56e3e3f 100644 --- a/llvm/unittests/TargetParser/RISCVTargetParserTest.cpp +++ b/llvm/unittests/TargetParser/RISCVTargetParserTest.cpp @@ -44,18 +44,27 @@ TEST(RISCVTuneFeature, AllTuneFeatures) { RISCV::getAllTuneFeatures(AllTuneFeatures); // Only allowed subtarget features that are explicitly marked by // special TableGen class. - EXPECT_EQ(AllTuneFeatures.size(), 19U); - for (auto F : - {"conditional-cmv-fusion", "disable-latency-sched-heuristic", - "disable-misched-load-clustering", "disable-misched-store-clustering", - "disable-postmisched-load-clustering", - "disable-postmisched-store-clustering", "single-element-vec-fp64", - "no-default-unroll", "no-sink-splat-operands", "use-postra-scheduler", - "predictable-select-expensive", "prefer-vsetvli-over-read-vlenb", - "prefer-w-inst", "short-forward-branch-ialu", - "short-forward-branch-iminmax", "short-forward-branch-imul", - "short-forward-branch-iload", "vl-dependent-latency", - "vxrm-pipeline-flush"}) + EXPECT_EQ(AllTuneFeatures.size(), 20U); + for (auto F : {"conditional-cmv-fusion", + "disable-latency-sched-heuristic", + "disable-misched-load-clustering", + "disable-misched-store-clustering", + "disable-postmisched-load-clustering", + "disable-postmisched-store-clustering", + "single-element-vec-fp64", + "no-default-unroll", + "no-sink-splat-operands", + "use-postra-scheduler", + "predictable-select-expensive", + "prefer-ascending-load-store", + "prefer-vsetvli-over-read-vlenb", + "prefer-w-inst", + "short-forward-branch-ialu", + "short-forward-branch-iminmax", + "short-forward-branch-imul", + "short-forward-branch-iload", + "vl-dependent-latency", + "vxrm-pipeline-flush"}) EXPECT_TRUE(is_contained(AllTuneFeatures, F)); }