[CodeGen] Consider imm offsets when sorting framerefs (#171012)
The LocalStackSlotAllocation pass disallows negative offsets with respect to a base register, so it ends up introducing a new base register for such frame references. This patch makes LocalStackSlotAllocation additionally consider the immediate offset of an instruction when sorting frame references, thereby avoiding negative offsets and maximizing reuse of existing base registers.
This commit is contained in:
parent
51f6c58793
commit
03ad3d2e8f
@ -51,6 +51,7 @@ namespace {
|
||||
class FrameRef {
|
||||
MachineBasicBlock::iterator MI; // Instr referencing the frame
|
||||
int64_t LocalOffset; // Local offset of the frame idx referenced
|
||||
int64_t InstrOffset; // Offset of the instruction from the frame index
|
||||
int FrameIdx; // The frame index
|
||||
|
||||
// Order reference instruction appears in program. Used to ensure
|
||||
@ -59,16 +60,20 @@ namespace {
|
||||
unsigned Order;
|
||||
|
||||
public:
|
||||
FrameRef(MachineInstr *I, int64_t Offset, int Idx, unsigned Ord) :
|
||||
MI(I), LocalOffset(Offset), FrameIdx(Idx), Order(Ord) {}
|
||||
FrameRef(MachineInstr *I, int64_t Offset, int64_t InstrOffset, int Idx,
|
||||
unsigned Ord)
|
||||
: MI(I), LocalOffset(Offset), InstrOffset(InstrOffset), FrameIdx(Idx),
|
||||
Order(Ord) {}
|
||||
|
||||
bool operator<(const FrameRef &RHS) const {
|
||||
return std::tie(LocalOffset, FrameIdx, Order) <
|
||||
std::tie(RHS.LocalOffset, RHS.FrameIdx, RHS.Order);
|
||||
return std::tuple(LocalOffset + InstrOffset, FrameIdx, Order) <
|
||||
std::tuple(RHS.LocalOffset + RHS.InstrOffset, RHS.FrameIdx,
|
||||
RHS.Order);
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator getMachineInstr() const { return MI; }
|
||||
int64_t getLocalOffset() const { return LocalOffset; }
|
||||
int64_t getInstrOffset() const { return InstrOffset; }
|
||||
int getFrameIndex() const { return FrameIdx; }
|
||||
};
|
||||
|
||||
@ -335,20 +340,27 @@ bool LocalStackSlotImpl::insertFrameReferenceRegisters(MachineFunction &Fn) {
|
||||
// than that, but the increased register pressure makes that a
|
||||
// tricky thing to balance. Investigate if re-materializing these
|
||||
// becomes an issue.
|
||||
for (const MachineOperand &MO : MI.operands()) {
|
||||
for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx != OpEnd;
|
||||
++OpIdx) {
|
||||
const MachineOperand &MO = MI.getOperand(OpIdx);
|
||||
// Consider replacing all frame index operands that reference
|
||||
// an object allocated in the local block.
|
||||
if (MO.isFI()) {
|
||||
// Don't try this with values not in the local block.
|
||||
if (!MFI.isObjectPreAllocated(MO.getIndex()))
|
||||
break;
|
||||
int Idx = MO.getIndex();
|
||||
int64_t LocalOffset = LocalOffsets[Idx];
|
||||
if (!TRI->needsFrameBaseReg(&MI, LocalOffset))
|
||||
break;
|
||||
FrameReferenceInsns.push_back(FrameRef(&MI, LocalOffset, Idx, Order++));
|
||||
if (!MO.isFI())
|
||||
continue;
|
||||
|
||||
int FrameIdx = MO.getIndex();
|
||||
// Don't try this with values not in the local block.
|
||||
if (!MFI.isObjectPreAllocated(FrameIdx))
|
||||
break;
|
||||
}
|
||||
|
||||
int64_t LocalOffset = LocalOffsets[FrameIdx];
|
||||
if (!TRI->needsFrameBaseReg(&MI, LocalOffset))
|
||||
break;
|
||||
|
||||
int64_t InstrOffset = TRI->getFrameIndexInstrOffset(&MI, OpIdx);
|
||||
FrameReferenceInsns.emplace_back(&MI, LocalOffset, InstrOffset,
|
||||
FrameIdx, Order++);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -61,13 +61,11 @@ define amdgpu_kernel void @issue155902(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3
|
||||
; GFX950-NEXT: s_nop 0
|
||||
; GFX950-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x188
|
||||
; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0
|
||||
; GFX950-NEXT: v_mov_b32_e32 v3, 0x4008
|
||||
; GFX950-NEXT: buffer_wbl2 sc0 sc1
|
||||
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX950-NEXT: scratch_store_dwordx2 v3, v[0:1], off sc0 sc1
|
||||
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:8
|
||||
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33
|
||||
; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0x384
|
||||
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:16
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
|
||||
; GFX950-NEXT: v_readlane_b32 s0, v2, 0
|
||||
; GFX950-NEXT: s_nop 4
|
||||
@ -295,8 +293,7 @@ define amdgpu_kernel void @issue155902_fp(i64 %arg, i64 %arg1, i64 %arg2, i64 %a
|
||||
; GFX950-NEXT: s_nop 0
|
||||
; GFX950-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x188
|
||||
; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0
|
||||
; GFX950-NEXT: s_add_i32 s1, s33, 0x4008
|
||||
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s1 offset:8
|
||||
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:8
|
||||
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0
|
||||
; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0x384
|
||||
; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16
|
||||
|
||||
@ -49,15 +49,15 @@ machineFunctionInfo:
|
||||
body: |
|
||||
bb.0:
|
||||
; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
|
||||
; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256
|
||||
; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 100
|
||||
; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
|
||||
; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec
|
||||
; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 156, [[V_ADD_U32_e64_]], 0, implicit $exec
|
||||
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
|
||||
; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 412, [[V_ADD_U32_e64_]], 0, implicit $exec
|
||||
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
|
||||
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]]
|
||||
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[COPY]]
|
||||
; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec
|
||||
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]]
|
||||
; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec
|
||||
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1245193 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]]
|
||||
; GFX10-NEXT: SI_RETURN
|
||||
;
|
||||
; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute
|
||||
|
||||
@ -294,12 +294,13 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
|
||||
; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
|
||||
; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
|
||||
; FLATSCR-NEXT: v_mov_b32_e32 v0, 0
|
||||
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
||||
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024
|
||||
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
|
||||
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_mov_b32 s0, 0
|
||||
; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop
|
||||
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; FLATSCR-NEXT: s_add_i32 s1, s0, 0x2000
|
||||
; FLATSCR-NEXT: s_add_i32 s1, s0, 0x4000
|
||||
; FLATSCR-NEXT: s_add_i32 s0, s0, 1
|
||||
; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
|
||||
; FLATSCR-NEXT: scratch_store_byte off, v0, s1
|
||||
@ -307,12 +308,12 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
|
||||
; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1
|
||||
; FLATSCR-NEXT: ; %bb.2: ; %split
|
||||
; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
|
||||
; FLATSCR-NEXT: s_addk_i32 s0, 0x2000
|
||||
; FLATSCR-NEXT: s_addk_i32 s0, 0x4000
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
|
||||
; FLATSCR-NEXT: s_movk_i32 s0, 0x4000
|
||||
; FLATSCR-NEXT: scratch_load_dwordx2 v[10:11], off, s0 offset:16 glc
|
||||
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
|
||||
; FLATSCR-NEXT: scratch_load_dwordx4 v[4:7], off, s0 glc
|
||||
|
||||
@ -13,7 +13,7 @@ body: |
|
||||
; CHECK-LABEL: name: issue155902
|
||||
; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.1
|
||||
; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
|
||||
; CHECK-NEXT: SCRATCH_STORE_DWORDX2_SADDR [[V_MOV_B]], %stack.1, 8, 0, implicit $exec, implicit $flat_scr
|
||||
; CHECK-NEXT: SCRATCH_STORE_DWORDX2_SADDR [[V_MOV_B]], [[S_MOV_B32_]], 8, 0, implicit $exec, implicit $flat_scr
|
||||
; CHECK-NEXT: SCRATCH_STORE_DWORDX2_SADDR [[V_MOV_B]], [[S_MOV_B32_]], 0, 0, implicit $exec, implicit $flat_scr
|
||||
; CHECK-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 900, implicit $exec
|
||||
; CHECK-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed [[V_MOV_B1]], [[S_MOV_B32_]], 16, 0, implicit $exec, implicit $flat_scr
|
||||
|
||||
@ -150,8 +150,7 @@ define dso_local void @required_fp(i32 %0, i32 %1) {
|
||||
; FP-NEXT: subs r1, r3, r1
|
||||
; FP-NEXT: mov sp, r1
|
||||
; FP-NEXT: movs r1, #0
|
||||
; FP-NEXT: str r1, [r6, #4]
|
||||
; FP-NEXT: str r0, [r2]
|
||||
; FP-NEXT: stm r2!, {r0, r1}
|
||||
; FP-NEXT: subs r6, r7, #7
|
||||
; FP-NEXT: subs r6, #1
|
||||
; FP-NEXT: mov sp, r6
|
||||
@ -184,8 +183,7 @@ define dso_local void @required_fp(i32 %0, i32 %1) {
|
||||
; FP-AAPCS-NEXT: subs r1, r3, r1
|
||||
; FP-AAPCS-NEXT: mov sp, r1
|
||||
; FP-AAPCS-NEXT: movs r1, #0
|
||||
; FP-AAPCS-NEXT: str r1, [r6, #4]
|
||||
; FP-AAPCS-NEXT: str r0, [r2]
|
||||
; FP-AAPCS-NEXT: stm r2!, {r0, r1}
|
||||
; FP-AAPCS-NEXT: mov r6, r11
|
||||
; FP-AAPCS-NEXT: subs r6, #8
|
||||
; FP-AAPCS-NEXT: mov sp, r6
|
||||
@ -216,8 +214,7 @@ define dso_local void @required_fp(i32 %0, i32 %1) {
|
||||
; NOFP-NEXT: subs r1, r3, r1
|
||||
; NOFP-NEXT: mov sp, r1
|
||||
; NOFP-NEXT: movs r1, #0
|
||||
; NOFP-NEXT: str r1, [r6, #4]
|
||||
; NOFP-NEXT: str r0, [r2]
|
||||
; NOFP-NEXT: stm r2!, {r0, r1}
|
||||
; NOFP-NEXT: subs r6, r7, #7
|
||||
; NOFP-NEXT: subs r6, #1
|
||||
; NOFP-NEXT: mov sp, r6
|
||||
@ -250,8 +247,7 @@ define dso_local void @required_fp(i32 %0, i32 %1) {
|
||||
; NOFP-AAPCS-NEXT: subs r1, r3, r1
|
||||
; NOFP-AAPCS-NEXT: mov sp, r1
|
||||
; NOFP-AAPCS-NEXT: movs r1, #0
|
||||
; NOFP-AAPCS-NEXT: str r1, [r6, #4]
|
||||
; NOFP-AAPCS-NEXT: str r0, [r2]
|
||||
; NOFP-AAPCS-NEXT: stm r2!, {r0, r1}
|
||||
; NOFP-AAPCS-NEXT: mov r6, r11
|
||||
; NOFP-AAPCS-NEXT: subs r6, #8
|
||||
; NOFP-AAPCS-NEXT: mov sp, r6
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user