[AMDGPU] Prioritize allocation of low 256 VGPR classes (#167978)

If we have 1024 VGPRs available we need to give priority to the allocation of these registers where operands can only use low 256. That is noteably scale operands of V_WMMA_SCALE instructions. Otherwise large tuples will be allocated first and take all low registers, so we would have to spill to get a room for these scale registers. Allocation priority itself does not eliminate spilling completely in large kernels, although helps to some degree. Increasing spill weight of a restricted class on top of it helps.
2025-11-19 16:00:46 -08:00 · 2025-11-19 16:00:46 -08:00 · 82380f33de
commit 82380f33de
parent 03f4d4d492
3 changed files with 14 additions and 4 deletions
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@ -496,6 +496,17 @@ public:

  SmallVector<StringLiteral>
  getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
+
+  float
+  getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override {
+    // Prioritize VGPR_32_Lo256 over other classes which may occupy registers
+    // beyond v256.
+    return AMDGPUGenRegisterInfo::getSpillWeightScaleFactor(RC) *
+           ((RC == &AMDGPU::VGPR_32_Lo256RegClass ||
+             RC == &AMDGPU::VReg_64_Lo256_Align2RegClass)
+                ? 2.0
+                : 1.0);
+  }
 };

 namespace AMDGPU {
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@ -644,7 +644,7 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1
 // Identical to VGPR_32 except it only contains the low 256 (Lo256) registers.
 def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
                                    (add (sequence "VGPR%u", 0, 255))> {
-  let AllocationPriority = 0;
+  let AllocationPriority = !add(3, !mul(BaseClassPriority, BaseClassScaleFactor));
  let GeneratePressureSet = 0;
  let Size = 32;
  let Weight = 1;
--- a/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll
@ -1,9 +1,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s

-; FIXME: Scale operands of WMMA are limited to low 256 VGPRs
-;        currently we are spilling it because all low VGPRs are occupied even though our budget is higher.
+; Scale operands of WMMA are limited to low 256 VGPRs
 ; Make sure we do not spill scale operands because of the low 256 restriction.
-; CHECK: ; ScratchSize: 12
+; CHECK: ; ScratchSize: 0
 ; CHECK: ; Occupancy: 1

 define amdgpu_kernel void @spill_scale_test(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, <16 x i32> %arg8, float %arg9, <16 x i32> %arg10, float %arg11, <16 x i8> %arg12) #0 {