
This changes the RC priorities so that AVRegClass is the least prioritized. These registers are less constrained than VRegClass and ARegClass, since they can be allocated to either an AGPR or a VGPR. Assigning them last therefore removes unnecessary constraints from the VRegClass and ARegClass assignments, and lets the RA make smarter decisions about whether to use a VGPR or an AGPR for each AVRegClass value.

We only have 5 bits for RC priorities, and we still want to prioritize larger RCs over smaller ones. Since the new prioritization uses the 5th bit to separate AVRegClass from ARegClass / VRegClass, only 4 bits remain to encode the size priorities. Previously, each RC with a distinct size had a distinct priority; this PR instead groups multiple sizes into the same priority. In practice this has no effect on prioritization, because we only have one actually defined RC per group per vector register type. For example, register classes with 15 or 16 32-bit registers share size priority 14, but only VReg_512 exists (there is no VReg_480), so only one actual RC in VRegClass has this priority. Similarly, register classes with 17 or more 32-bit registers get size priority 15, but we only have VReg_1024.

The net effect of this PR is to prioritize first by vector register type (VReg and AReg have top priority, then AVReg), with the size of the register class having second priority.

Passes PSDB.

---------

Co-authored-by: Matt Arsenault <Matthew.Arsenault@amd.com>
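For readers unfamiliar with the encoding, here is a minimal standalone sketch of the scheme described above. It is not the actual AMDGPU TableGen/C++ code: the helper names (sizePriority, allocationPriority) and the bucket mapping for classes smaller than 15 registers are assumptions made purely for illustration.

// Hypothetical illustration of the 5-bit allocation priority described above.
// The real priorities live in the AMDGPU register TableGen definitions.
#include <algorithm>
#include <cassert>

// Low 4 bits: size priority. Sizes are grouped, e.g. classes with 15 or 16
// 32-bit registers share priority 14, and classes with 17+ share priority 15.
static unsigned sizePriority(unsigned NumRegs32) {
  if (NumRegs32 >= 17)
    return 15;
  if (NumRegs32 >= 15)
    return 14;
  return std::min(NumRegs32 - 1, 13u); // smaller classes: assumed mapping
}

// 5th bit: pure VGPR/AGPR classes outrank the less constrained AV classes,
// so AV values are assigned last.
static unsigned allocationPriority(bool IsAVClass, unsigned NumRegs32) {
  unsigned Prio = sizePriority(NumRegs32);
  if (!IsAVClass)
    Prio |= 1u << 4;
  assert(Prio < 32 && "only 5 bits are available for the priority");
  return Prio;
}

int main() {
  // Register type dominates and size breaks ties: VReg_512 outranks AV_512,
  // and even AV_1024 ranks below the smallest VGPR class.
  assert(allocationPriority(/*IsAVClass=*/false, 16) >
         allocationPriority(/*IsAVClass=*/true, 16));
  assert(allocationPriority(/*IsAVClass=*/true, 32) <
         allocationPriority(/*IsAVClass=*/false, 1));
  return 0;
}

The test below exercises the failure path this reprioritization interacts with: allocation of an inflated av_512_align2 class that ultimately runs out of registers.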
# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -simplify-mir -start-before=greedy,2 -stress-regalloc=4 -stop-before=virtregrewriter,2 -o - -verify-regalloc %s 2> %t.err | FileCheck %s
# RUN: FileCheck -check-prefix=ERR %s < %t.err

# To allocate the vreg_512_align2, the allocation will attempt to
# inflate the register class to av_512_align2. This will ultimately
# not work, and the allocation will fail. There is an unproductive
# live range split, and we end up with a snippet copy of an
# unspillable register. Recursive assignment of interfering ranges
# during last chance recoloring would delete the unspillable snippet
# live range. Make sure there's no use after free when rolling back
# the last chance assignment.

# ERR: error: <unknown>:0:0: ran out of registers during register allocation in function 'inflated_reg_class_copy_use_after_free'
# ERR: error: <unknown>:0:0: ran out of registers during register allocation in function 'inflated_reg_class_copy_use_after_free_lane_subset'

--- |
  define amdgpu_kernel void @inflated_reg_class_copy_use_after_free() {
    ret void
  }

  define amdgpu_kernel void @inflated_reg_class_copy_use_after_free_lane_subset() {
    ret void
  }

...

# CHECK-LABEL: name: inflated_reg_class_copy_use_after_free
# CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3
# CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5)
# CHECK-NEXT: [[RESTORE0:%[0-9]+]]:vreg_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
# CHECK-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[RESTORE0]], 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec
# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 {
# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0
# CHECK-NEXT: }
# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
# CHECK-NEXT: }
# CHECK-NEXT: [[RESTORE2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
# CHECK-NEXT: [[MFMA_USE1:%[0-9]+]].sub0_sub1:vreg_512_align2 = COPY [[RESTORE2]].sub0_sub1
# CHECK-NEXT: [[MFMA_USE1]].sub2:vreg_512_align2 = COPY [[SPLIT2]].sub3
# CHECK-NEXT: [[MFMA_USE1]].sub3:vreg_512_align2 = COPY [[SPLIT2]].sub2
# CHECK-NEXT: [[MFMA_USE1]].sub4:vreg_512_align2 = COPY [[SPLIT2]].sub0
# CHECK-NEXT: [[MFMA_USE1]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]].sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[MFMA_USE1]]:vreg_512_align2 = V_MFMA_F32_16X16X1F32_mac_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[MFMA_USE1]], 0, 0, 0, implicit $mode, implicit $exec

---
name: inflated_reg_class_copy_use_after_free
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
  scratchRSrcReg: '$sgpr72_sgpr73_sgpr74_sgpr75'
  stackPtrOffsetReg: '$sgpr32'
  occupancy: 7
  vgprForAGPRCopy: '$vgpr255'
  sgprForEXECCopy: '$sgpr74_sgpr75'
body: |
  bb.0:
    liveins: $vgpr0, $sgpr4_sgpr5

    %0:vgpr_32 = IMPLICIT_DEF
    renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed undef renamable $sgpr4_sgpr5, 0, 0 :: (load (s64), addrspace 4)
    S_NOP 0, implicit-def undef %1.sub12_sub13_sub14_sub15:vreg_512_align2
    S_NOP 0, implicit-def %1.sub8_sub9_sub10_sub11:vreg_512_align2
    S_NOP 0, implicit-def %1.sub4_sub5_sub6_sub7:vreg_512_align2
    S_NOP 0, implicit-def %1.sub0_sub1_sub2_sub3:vreg_512_align2
    early-clobber %2:vreg_512_align2 = V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, %1, 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec
    %1.sub2:vreg_512_align2 = COPY %2.sub3
    %1.sub3:vreg_512_align2 = COPY %2.sub2
    %1.sub4:vreg_512_align2 = COPY %2.sub0
    %1.sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1:vreg_512_align2 = V_MFMA_F32_16X16X1F32_mac_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, %1, 0, 0, 0, implicit $mode, implicit $exec
    GLOBAL_STORE_DWORDX4_SADDR undef %3:vgpr_32, %1.sub12_sub13_sub14_sub15, undef renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), addrspace 1)
    S_ENDPGM 0

...
# This test is similar to the previous one, except it is still broken when the use
# instruction does not read the full set of lanes after one attempted fix.

# CHECK-LABEL: name: inflated_reg_class_copy_use_after_free_lane_subset
# CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3
# CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5)
# CHECK-NEXT: [[RESTORE_0:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
# CHECK-NEXT: S_NOP 0, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit [[RESTORE_0]].sub0_sub1_sub2_sub3, implicit [[RESTORE_0]].sub4_sub5_sub6_sub7
# CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 {
# CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0
# CHECK-NEXT: }
# CHECK-NEXT: undef [[SPLIT1:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT0]].sub2_sub3 {
# CHECK-NEXT: internal [[SPLIT1]].sub0:av_512_align2 = COPY [[SPLIT0]].sub0
# CHECK-NEXT: }
# CHECK-NEXT: undef [[SPLIT2:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT1]].sub2_sub3 {
# CHECK-NEXT: internal [[SPLIT2]].sub0:av_512_align2 = COPY [[SPLIT1]].sub0
# CHECK-NEXT: }
# CHECK-NEXT: SI_SPILL_AV512_SAVE [[SPLIT2]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
# CHECK-NEXT: [[RESTORE_1:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5)
# CHECK-NEXT: undef [[SPLIT3:%[0-9]+]].sub0_sub1:av_512_align2 = COPY [[RESTORE_1]].sub0_sub1
# CHECK-NEXT: [[RESTORE_2:%[0-9]+]]:av_512_align2 = SI_SPILL_AV512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
# CHECK-NEXT: undef [[SPLIT4:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[RESTORE_2]].sub2_sub3 {
# CHECK-NEXT: internal [[SPLIT4]].sub0:av_512_align2 = COPY [[RESTORE_2]].sub0
# CHECK-NEXT: }
# CHECK-NEXT: undef [[SPLIT5:%[0-9]+]].sub2_sub3:av_512_align2 = COPY [[SPLIT4]].sub2_sub3 {
# CHECK-NEXT: internal [[SPLIT5]].sub0:av_512_align2 = COPY [[SPLIT4]].sub0
# CHECK-NEXT: }
# CHECK-NEXT: [[SPLIT3]].sub2:av_512_align2 = COPY [[SPLIT5]].sub3
# CHECK-NEXT: undef [[SPLIT6:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT3]].sub0_sub1_sub2
# CHECK-NEXT: undef [[SPLIT7:%[0-9]+]].sub0_sub1_sub2:av_512_align2 = COPY [[SPLIT6]].sub0_sub1_sub2
# CHECK-NEXT: undef [[SPLIT8:%[0-9]+]].sub0:av_512_align2 = COPY [[SPLIT5]].sub0 {
# CHECK-NEXT: internal [[SPLIT8]].sub2:av_512_align2 = COPY [[SPLIT5]].sub2
# CHECK-NEXT: }
# CHECK-NEXT: [[SPLIT7]].sub3:av_512_align2 = COPY [[SPLIT8]].sub2
# CHECK-NEXT: undef [[SPLIT9:%[0-9]+]].sub0_sub1_sub2_sub3:av_512_align2 = COPY [[SPLIT7]].sub0_sub1_sub2_sub3
# CHECK-NEXT: undef [[LAST_USE:%[0-9]+]].sub0_sub1_sub2_sub3:vreg_512_align2 = COPY [[SPLIT9]].sub0_sub1_sub2_sub3
# CHECK-NEXT: [[LAST_USE]].sub4:vreg_512_align2 = COPY [[SPLIT8]].sub0
# CHECK-NEXT: [[LAST_USE]].sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[LAST_USE]].sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[LAST_USE]].sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[LAST_USE]].sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[LAST_USE]].sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[LAST_USE]].sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[LAST_USE]].sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[LAST_USE]].sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[LAST_USE]].sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[LAST_USE]].sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: [[LAST_USE]].sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
# CHECK-NEXT: S_NOP 0, implicit-def [[LAST_USE]], implicit [[LAST_USE]].sub0_sub1_sub2_sub3, implicit [[LAST_USE]].sub4_sub5_sub6_sub7, implicit [[LAST_USE]].sub8_sub9_sub10_sub11

---
name: inflated_reg_class_copy_use_after_free_lane_subset
tracksRegLiveness: true
machineFunctionInfo:
  isEntryFunction: true
  scratchRSrcReg: '$sgpr72_sgpr73_sgpr74_sgpr75'
  stackPtrOffsetReg: '$sgpr32'
  occupancy: 7
  vgprForAGPRCopy: '$vgpr255'
  sgprForEXECCopy: '$sgpr74_sgpr75'
body: |
  bb.0:
    liveins: $vgpr0, $sgpr4_sgpr5

    %0:vgpr_32 = IMPLICIT_DEF
    renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed undef renamable $sgpr4_sgpr5, 0, 0 :: (load (s64), addrspace 4)
    S_NOP 0, implicit-def undef %1.sub12_sub13_sub14_sub15:vreg_512_align2
    S_NOP 0, implicit-def %1.sub8_sub9_sub10_sub11:vreg_512_align2
    S_NOP 0, implicit-def %1.sub4_sub5_sub6_sub7:vreg_512_align2
    S_NOP 0, implicit-def %1.sub0_sub1_sub2_sub3:vreg_512_align2
    S_NOP 0, implicit-def early-clobber %2:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7
    %1.sub2:vreg_512_align2 = COPY %2.sub3
    %1.sub3:vreg_512_align2 = COPY %2.sub2
    %1.sub4:vreg_512_align2 = COPY %2.sub0
    %1.sub5:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub6:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub7:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub8:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub9:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub10:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub11:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub12:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub13:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub14:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    %1.sub15:vreg_512_align2 = V_MOV_B32_e32 0, implicit $exec
    S_NOP 0, implicit-def %1:vreg_512_align2, implicit %1.sub0_sub1_sub2_sub3, implicit %1.sub4_sub5_sub6_sub7, implicit %1.sub8_sub9_sub10_sub11
    GLOBAL_STORE_DWORDX4_SADDR undef %3:vgpr_32, %1.sub12_sub13_sub14_sub15, undef renamable $sgpr0_sgpr1, 96, 0, implicit $exec :: (store (s128), addrspace 1)
    S_ENDPGM 0

...