[AMDGPU] Prioritize allocation of low 256 VGPR classes (#167978)
If we have 1024 VGPRs available, we need to give allocation priority to the register classes whose operands can only use the low 256 registers. That is notably the case for the scale operands of V_WMMA_SCALE instructions. Otherwise large tuples will be allocated first and take all of the low registers, so we would have to spill to make room for these scale registers. Allocation priority by itself does not eliminate spilling completely in large kernels, although it helps to some degree. Increasing the spill weight of a restricted class on top of it helps further.
This commit is contained in:
parent
03f4d4d492
commit
82380f33de
@ -496,6 +496,17 @@ public:
|
||||
|
||||
SmallVector<StringLiteral>
|
||||
getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
|
||||
|
||||
float
|
||||
getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override {
|
||||
// Prioritize VGPR_32_Lo256 over other classes which may occupy registers
|
||||
// beyond v256.
|
||||
return AMDGPUGenRegisterInfo::getSpillWeightScaleFactor(RC) *
|
||||
((RC == &AMDGPU::VGPR_32_Lo256RegClass ||
|
||||
RC == &AMDGPU::VReg_64_Lo256_Align2RegClass)
|
||||
? 2.0
|
||||
: 1.0);
|
||||
}
|
||||
};
|
||||
|
||||
namespace AMDGPU {
|
||||
|
||||
@ -644,7 +644,7 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1
|
||||
// Identical to VGPR_32 except it only contains the low 256 (Lo256) registers.
|
||||
def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
|
||||
(add (sequence "VGPR%u", 0, 255))> {
|
||||
let AllocationPriority = 0;
|
||||
let AllocationPriority = !add(3, !mul(BaseClassPriority, BaseClassScaleFactor));
|
||||
let GeneratePressureSet = 0;
|
||||
let Size = 32;
|
||||
let Weight = 1;
|
||||
|
||||
@ -1,9 +1,8 @@
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s
|
||||
|
||||
; FIXME: Scale operands of WMMA are limited to low 256 VGPRs
|
||||
; currently we are spilling it because all low VGPRs are occupied even though our budget is higher.
|
||||
; Scale operands of WMMA are limited to low 256 VGPRs
|
||||
; Make sure we do not spill scale operands because of the low 256 restriction.
|
||||
; CHECK: ; ScratchSize: 12
|
||||
; CHECK: ; ScratchSize: 0
|
||||
; CHECK: ; Occupancy: 1
|
||||
|
||||
define amdgpu_kernel void @spill_scale_test(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, <16 x i32> %arg8, float %arg9, <16 x i32> %arg10, float %arg11, <16 x i8> %arg12) #0 {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user