[AMDGPU] Constrain register class during COPY elimination based on its uses (#181909)
When a WMMA scale operand (requiring `VCSrc_b32_Lo256`) was an SGPR that had been copied from a VGPR, the COPY elimination replaced all uses of the SGPR with the VGPR but failed to constrain the VGPR to `vgpr_32_lo256`. This allowed the register allocator to assign a VGPR >= 256 (e.g., $vgpr309), violating the hardware encoding requirement.
This commit is contained in:
parent
22ecdda485
commit
e0b3e82e98
@ -8319,6 +8319,26 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
|
||||
const TargetRegisterClass *SrcRC = RI.getRegClassForReg(MRI, NewDstReg);
|
||||
if (const TargetRegisterClass *CommonRC =
|
||||
RI.getCommonSubClass(NewDstRC, SrcRC)) {
|
||||
// Also intersect with VGPR-compatible operand register class
|
||||
// constraints from user instructions. This preserves restricted
|
||||
// register classes (e.g., VGPR_32_Lo256 for WMMA scale operands) that
|
||||
// would otherwise be lost when an SGPR is replaced with a VGPR.
|
||||
// Constraints incompatible with VGPRs (e.g., SALU instructions
|
||||
// requiring SReg_32) are skipped because those users will be converted
|
||||
// to VALU by the worklist.
|
||||
for (const MachineOperand &UseMO : MRI.use_operands(DstReg)) {
|
||||
const MachineInstr *UseMI = UseMO.getParent();
|
||||
if (UseMI == &Inst)
|
||||
continue;
|
||||
unsigned OpIdx = UseMI->getOperandNo(&UseMO);
|
||||
if (const TargetRegisterClass *OpRC =
|
||||
getRegClass(UseMI->getDesc(), OpIdx)) {
|
||||
if (const TargetRegisterClass *Narrowed =
|
||||
RI.getCommonSubClass(CommonRC, OpRC))
|
||||
CommonRC = Narrowed;
|
||||
}
|
||||
}
|
||||
|
||||
// Instead of creating a copy where src and dst are the same register
|
||||
// class, we just replace all uses of dst with src. These kinds of
|
||||
// copies interfere with the heuristics MachineSink uses to decide
|
||||
|
||||
@ -0,0 +1,65 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck %s
|
||||
|
||||
# Test that si-fix-sgpr-copies preserves the Lo256 register class constraint
|
||||
# when eliminating a VGPR-to-SGPR copy used as a WMMA scale operand.
|
||||
#
|
||||
# The scale_src0 and scale_src1 operands of V_WMMA_SCALE instructions require
|
||||
# registers from VCSrc_b32_Lo256 (VS_32_Lo256), which only allows VGPRs 0-255.
|
||||
# When si-fix-sgpr-copies eliminates a VGPR-to-SGPR copy by replacing uses of
|
||||
# the SGPR with the VGPR source, it must constrain the VGPR to vgpr_32_lo256
|
||||
# to preserve this hardware encoding requirement.
|
||||
|
||||
---
|
||||
name: wmma_scale_copy_vgpr_to_sgpr
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $sgpr0
|
||||
; CHECK-LABEL: name: wmma_scale_copy_vgpr_to_sgpr
|
||||
; CHECK: liveins: $vgpr0, $sgpr0
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32_lo256 = COPY $vgpr0
|
||||
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: early-clobber %6:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[COPY]], [[DEF3]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:sreg_32 = COPY %0
|
||||
%2:vreg_512_align2 = IMPLICIT_DEF
|
||||
%3:vreg_512_align2 = IMPLICIT_DEF
|
||||
%4:vreg_256_align2 = IMPLICIT_DEF
|
||||
%5:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
%6:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2, %3, 0, %4, %1, %5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
...
|
||||
|
||||
# Also test that the scale_src1 (operand 6) constraint is preserved.
|
||||
|
||||
---
|
||||
name: wmma_scale_copy_vgpr_to_sgpr_src1
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $sgpr0
|
||||
; CHECK-LABEL: name: wmma_scale_copy_vgpr_to_sgpr_src1
|
||||
; CHECK: liveins: $vgpr0, $sgpr0
|
||||
; CHECK-NEXT: {{ $}}
|
||||
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32_lo256 = COPY $vgpr0
|
||||
; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
; CHECK-NEXT: early-clobber %6:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
; CHECK-NEXT: S_ENDPGM 0
|
||||
%0:vgpr_32 = COPY $vgpr0
|
||||
%1:sreg_32 = COPY %0
|
||||
%2:vreg_512_align2 = IMPLICIT_DEF
|
||||
%3:vreg_512_align2 = IMPLICIT_DEF
|
||||
%4:vreg_256_align2 = IMPLICIT_DEF
|
||||
%5:vgpr_32_lo256 = IMPLICIT_DEF
|
||||
%6:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %2, %3, 0, %4, %5, %1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
...
|
||||
Loading…
x
Reference in New Issue
Block a user