[AMDGPU] Prioritize allocation of low 256 VGPR classes (#167978)

If we have 1024 VGPRs available we need to give priority to the
allocation of these registers where operands can only use low 256.
Notably, these are the scale operands of V_WMMA_SCALE instructions.
Otherwise large tuples will be allocated first and take all low
registers, so we would have to spill to make room for these
scale registers.

Allocation priority itself does not eliminate spilling completely
in large kernels, although it helps to some degree. Increasing the spill
weight of a restricted class on top of it helps.
This commit is contained in:
Stanislav Mekhanoshin 2025-11-19 16:00:46 -08:00 committed by GitHub
parent 03f4d4d492
commit 82380f33de
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 14 additions and 4 deletions

View File

@ -496,6 +496,17 @@ public:
SmallVector<StringLiteral>
getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
/// Returns the spill weight scale factor for \p RC.
///
/// The factor produced by the generated register info is doubled for
/// VGPR_32_Lo256 and VReg_64_Lo256_Align2, and returned unchanged for every
/// other register class.
float
getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override {
  // Favor the Lo256-restricted classes over other classes which may occupy
  // registers beyond v256.
  const bool IsLo256Restricted =
      RC == &AMDGPU::VGPR_32_Lo256RegClass ||
      RC == &AMDGPU::VReg_64_Lo256_Align2RegClass;
  const double Multiplier = IsLo256Restricted ? 2.0 : 1.0;
  return AMDGPUGenRegisterInfo::getSpillWeightScaleFactor(RC) * Multiplier;
}
};
namespace AMDGPU {

View File

@ -644,7 +644,7 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1
// Identical to VGPR_32 except it only contains the low 256 (Lo256) registers.
def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 0;
let AllocationPriority = !add(3, !mul(BaseClassPriority, BaseClassScaleFactor));
let GeneratePressureSet = 0;
let Size = 32;
let Weight = 1;

View File

@ -1,9 +1,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s
; FIXME: Scale operands of WMMA are limited to low 256 VGPRs
; currently we are spilling it because all low VGPRs are occupied even though our budget is higher.
; Scale operands of WMMA are limited to low 256 VGPRs
; Make sure we do not spill scale operands because of the low 256 restriction.
; CHECK: ; ScratchSize: 12
; CHECK: ; ScratchSize: 0
; CHECK: ; Occupancy: 1
define amdgpu_kernel void @spill_scale_test(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, <16 x i32> %arg8, float %arg9, <16 x i32> %arg10, float %arg11, <16 x i8> %arg12) #0 {