diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index 987f64f56403..d316f8d804f5 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -51,6 +51,7 @@ namespace { class FrameRef { MachineBasicBlock::iterator MI; // Instr referencing the frame int64_t LocalOffset; // Local offset of the frame idx referenced + int64_t InstrOffset; // Offset of the instruction from the frame index int FrameIdx; // The frame index // Order reference instruction appears in program. Used to ensure @@ -59,16 +60,20 @@ namespace { unsigned Order; public: - FrameRef(MachineInstr *I, int64_t Offset, int Idx, unsigned Ord) : - MI(I), LocalOffset(Offset), FrameIdx(Idx), Order(Ord) {} + FrameRef(MachineInstr *I, int64_t Offset, int64_t InstrOffset, int Idx, + unsigned Ord) + : MI(I), LocalOffset(Offset), InstrOffset(InstrOffset), FrameIdx(Idx), + Order(Ord) {} bool operator<(const FrameRef &RHS) const { - return std::tie(LocalOffset, FrameIdx, Order) < - std::tie(RHS.LocalOffset, RHS.FrameIdx, RHS.Order); + return std::tuple(LocalOffset + InstrOffset, FrameIdx, Order) < + std::tuple(RHS.LocalOffset + RHS.InstrOffset, RHS.FrameIdx, + RHS.Order); } MachineBasicBlock::iterator getMachineInstr() const { return MI; } int64_t getLocalOffset() const { return LocalOffset; } + int64_t getInstrOffset() const { return InstrOffset; } int getFrameIndex() const { return FrameIdx; } }; @@ -335,20 +340,27 @@ bool LocalStackSlotImpl::insertFrameReferenceRegisters(MachineFunction &Fn) { // than that, but the increased register pressure makes that a // tricky thing to balance. Investigate if re-materializing these // becomes an issue. - for (const MachineOperand &MO : MI.operands()) { + for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx != OpEnd; + ++OpIdx) { + const MachineOperand &MO = MI.getOperand(OpIdx); // Consider replacing all frame index operands that reference // an object allocated in the local block. - if (MO.isFI()) { - // Don't try this with values not in the local block. - if (!MFI.isObjectPreAllocated(MO.getIndex())) - break; - int Idx = MO.getIndex(); - int64_t LocalOffset = LocalOffsets[Idx]; - if (!TRI->needsFrameBaseReg(&MI, LocalOffset)) - break; - FrameReferenceInsns.push_back(FrameRef(&MI, LocalOffset, Idx, Order++)); + if (!MO.isFI()) + continue; + + int FrameIdx = MO.getIndex(); + // Don't try this with values not in the local block. + if (!MFI.isObjectPreAllocated(FrameIdx)) break; - } + + int64_t LocalOffset = LocalOffsets[FrameIdx]; + if (!TRI->needsFrameBaseReg(&MI, LocalOffset)) + break; + + int64_t InstrOffset = TRI->getFrameIndexInstrOffset(&MI, OpIdx); + FrameReferenceInsns.emplace_back(&MI, LocalOffset, InstrOffset, + FrameIdx, Order++); + break; } } } diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll index 26acb4604cbc..122b75acf400 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll @@ -61,13 +61,11 @@ define amdgpu_kernel void @issue155902(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3 ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x188 ; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0 -; GFX950-NEXT: v_mov_b32_e32 v3, 0x4008 -; GFX950-NEXT: buffer_wbl2 sc0 sc1 -; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX950-NEXT: scratch_store_dwordx2 v3, v[0:1], off sc0 sc1 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:8 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 ; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0x384 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:16 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-NEXT: v_readlane_b32 s0, v2, 0 ; GFX950-NEXT: s_nop 4 @@ -295,8 +293,7 @@ define amdgpu_kernel void @issue155902_fp(i64 %arg, i64 %arg1, i64 %arg2, i64 %a ; GFX950-NEXT: s_nop 0 ; GFX950-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x188 ; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0 -; GFX950-NEXT: s_add_i32 s1, s33, 0x4008 -; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s1 offset:8 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0x384 ; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir index 8ea9ec397fe0..3be645621316 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir @@ -49,15 +49,15 @@ machineFunctionInfo: body: | bb.0: ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute - ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 100 ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 156, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 412, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]] - ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] - ; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec - ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] ; GFX10-NEXT: SI_RETURN ; ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 5f0ca7bc42ae..3d02d70d2fdb 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -294,12 +294,13 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024 +; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s0, 0 ; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop ; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLATSCR-NEXT: s_add_i32 s1, s0, 0x2000 +; FLATSCR-NEXT: s_add_i32 s1, s0, 0x4000 ; FLATSCR-NEXT: s_add_i32 s0, s0, 1 ; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120 ; FLATSCR-NEXT: scratch_store_byte off, v0, s1 @@ -307,12 +308,12 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1 ; FLATSCR-NEXT: ; %bb.2: ; %split ; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 -; FLATSCR-NEXT: s_addk_i32 s0, 0x2000 +; FLATSCR-NEXT: s_addk_i32 s0, 0x4000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 +; FLATSCR-NEXT: s_movk_i32 s0, 0x4000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:16 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s0 glc diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-sort-framerefs.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-sort-framerefs.mir index 1364ad524824..b37d6d36b3f3 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-sort-framerefs.mir +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-sort-framerefs.mir @@ -13,7 +13,7 @@ body: | ; CHECK-LABEL: name: issue155902 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.1 ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec - ; CHECK-NEXT: SCRATCH_STORE_DWORDX2_SADDR [[V_MOV_B]], %stack.1, 8, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: SCRATCH_STORE_DWORDX2_SADDR [[V_MOV_B]], [[S_MOV_B32_]], 8, 0, implicit $exec, implicit $flat_scr ; CHECK-NEXT: SCRATCH_STORE_DWORDX2_SADDR [[V_MOV_B]], [[S_MOV_B32_]], 0, 0, implicit $exec, implicit $flat_scr ; CHECK-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 900, implicit $exec ; CHECK-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed [[V_MOV_B1]], [[S_MOV_B32_]], 16, 0, implicit $exec, implicit $flat_scr diff --git a/llvm/test/CodeGen/Thumb/frame-chain.ll b/llvm/test/CodeGen/Thumb/frame-chain.ll index a680f2fa4a48..8dde3b5fed75 100644 --- a/llvm/test/CodeGen/Thumb/frame-chain.ll +++ b/llvm/test/CodeGen/Thumb/frame-chain.ll @@ -150,8 +150,7 @@ define dso_local void @required_fp(i32 %0, i32 %1) { ; FP-NEXT: subs r1, r3, r1 ; FP-NEXT: mov sp, r1 ; FP-NEXT: movs r1, #0 -; FP-NEXT: str r1, [r6, #4] -; FP-NEXT: str r0, [r2] +; FP-NEXT: stm r2!, {r0, r1} ; FP-NEXT: subs r6, r7, #7 ; FP-NEXT: subs r6, #1 ; FP-NEXT: mov sp, r6 @@ -184,8 +183,7 @@ define dso_local void @required_fp(i32 %0, i32 %1) { ; FP-AAPCS-NEXT: subs r1, r3, r1 ; FP-AAPCS-NEXT: mov sp, r1 ; FP-AAPCS-NEXT: movs r1, #0 -; FP-AAPCS-NEXT: str r1, [r6, #4] -; FP-AAPCS-NEXT: str r0, [r2] +; FP-AAPCS-NEXT: stm r2!, {r0, r1} ; FP-AAPCS-NEXT: mov r6, r11 ; FP-AAPCS-NEXT: subs r6, #8 ; FP-AAPCS-NEXT: mov sp, r6 @@ -216,8 +214,7 @@ define dso_local void @required_fp(i32 %0, i32 %1) { ; NOFP-NEXT: subs r1, r3, r1 ; NOFP-NEXT: mov sp, r1 ; NOFP-NEXT: movs r1, #0 -; NOFP-NEXT: str r1, [r6, #4] -; NOFP-NEXT: str r0, [r2] +; NOFP-NEXT: stm r2!, {r0, r1} ; NOFP-NEXT: subs r6, r7, #7 ; NOFP-NEXT: subs r6, #1 ; NOFP-NEXT: mov sp, r6 @@ -250,8 +247,7 @@ define dso_local void @required_fp(i32 %0, i32 %1) { ; NOFP-AAPCS-NEXT: subs r1, r3, r1 ; NOFP-AAPCS-NEXT: mov sp, r1 ; NOFP-AAPCS-NEXT: movs r1, #0 -; NOFP-AAPCS-NEXT: str r1, [r6, #4] -; NOFP-AAPCS-NEXT: str r0, [r2] +; NOFP-AAPCS-NEXT: stm r2!, {r0, r1} ; NOFP-AAPCS-NEXT: mov r6, r11 ; NOFP-AAPCS-NEXT: subs r6, #8 ; NOFP-AAPCS-NEXT: mov sp, r6