Compare commits


1 commit

Author: Matt Arsenault
SHA1: a32d42e594

AMDGPU: Simplify foldImmediate with register class based checks
Generalize the code over the properties of the mov instruction,
rather than maintaining parallel logic to figure out which type
of mov to use. I've maintained the existing behavior with 16-bit
physical SGPRs, though I think that behavior is broken: it corrupts
any value that happens to be live in the high bits. It just happens
that there is no way to write those high bits separately with a real
instruction, but I don't think we should be building assumptions
around that property.
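
Editorial note: a minimal standalone sketch of the selection scheme described
above. The MovCandidate struct, its property values, and the inline-constant
range are simplified, hypothetical stand-ins for the real MCInstrDesc and
register class queries; this is not the LLVM API.

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Hypothetical description of one candidate mov opcode. The idea in the patch
// is to read such properties off the instruction description instead of
// keeping a parallel table of which mov to emit for which destination.
struct MovCandidate {
  std::string Name;     // e.g. "S_MOV_B32"
  unsigned DstSizeBits; // size of the mov's destination register class
  bool DstIsVector;     // crude stand-in for SGPR vs VGPR destination class
  bool TakesLiteral;    // illustrative: can encode an arbitrary 32-bit literal
};

// Walk the candidate list and return the first mov whose destination class is
// compatible with the copy being folded and whose immediate operand can hold
// the value; give up if the class matches but the value is not encodable.
static std::optional<MovCandidate>
pickMov(const std::vector<MovCandidate> &Candidates, unsigned CopySizeBits,
        bool CopyIsVector, int64_t Imm) {
  // Simplified inline-constant test (the real check is per operand type).
  auto IsInline = [](int64_t V) { return V >= -16 && V <= 64; };
  for (const MovCandidate &C : Candidates) {
    if (C.DstSizeBits != CopySizeBits || C.DstIsVector != CopyIsVector)
      continue; // wrong register class, try the next mov
    if (!C.TakesLiteral && !IsInline(Imm))
      return std::nullopt; // value is not a legal immediate for this mov
    return C;
  }
  return std::nullopt;
}

int main() {
  // Illustrative property values only.
  std::vector<MovCandidate> Movs = {{"S_MOV_B32", 32, false, true},
                                    {"V_MOV_B32_e32", 32, true, true},
                                    {"S_MOV_B64", 64, false, true},
                                    {"V_MOV_B64_PSEUDO", 64, true, true}};
  // A 32-bit vector copy of the inline constant 64 picks V_MOV_B32_e32.
  if (std::optional<MovCandidate> M = pickMov(Movs, 32, true, 64))
    std::cout << "fold to " << M->Name << "\n";
}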

This is NFC-ish. It now does a better job with imm pseudos, which
in practice won't reach here. This will also make it easier to
support more folds in a future patch.

I added a couple of new tests with 16-bit extracts of 64-bit sources.
The only other test change is in how an immediate is rendered: it is
now printed sign extended rather than zero extended.
Date: 2025-08-22 20:31:12 +09:00
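
Editorial note: a worked illustration of the zero- versus sign-extension
difference mentioned in the commit message. The 64-bit value below is made up
for the example, not taken from the tests; only the extension arithmetic is
the point.

#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical 64-bit immediate whose low 16 bits are 0x8001.
  uint64_t Imm = 0x123456789abc8001ull;
  uint16_t Lo16 = static_cast<uint16_t>(Imm); // extract the lo16 subregister

  // Rendered zero extended, the folded 16-bit immediate prints as 32769.
  std::cout << "zero extended: " << static_cast<uint32_t>(Lo16) << "\n";

  // Rendered sign extended (bit 15 is set), the same bit pattern prints as
  // -32767, which is the kind of textual change the commit describes.
  std::cout << "sign extended: "
            << static_cast<int32_t>(static_cast<int16_t>(Lo16)) << "\n";
}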
2 changed files with 136 additions and 40 deletions


@@ -3573,54 +3573,93 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
     Register DstReg = UseMI.getOperand(0).getReg();
-    unsigned OpSize = getOpSize(UseMI, 0);
-    bool Is16Bit = OpSize == 2;
-    bool Is64Bit = OpSize == 8;
-    bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
-    unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
-                                           : AMDGPU::V_MOV_B32_e32
-                                 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
-                                           : AMDGPU::S_MOV_B32;
+    Register UseSubReg = UseMI.getOperand(1).getSubReg();
-    std::optional<int64_t> SubRegImm =
-        extractSubregFromImm(Imm, UseMI.getOperand(1).getSubReg());
+    const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
-    APInt Imm(Is64Bit ? 64 : 32, *SubRegImm,
-              /*isSigned=*/true, /*implicitTrunc=*/true);
-    if (RI.isAGPR(*MRI, DstReg)) {
-      if (Is64Bit || !isInlineConstant(Imm))
-        return false;
-      NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
-    }
+    bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
+                   RI.getSubRegIdxSize(UseSubReg) == 16;
     if (Is16Bit) {
-      if (isVGPRCopy)
+      if (RI.hasVGPRs(DstRC))
         return false; // Do not clobber vgpr_hi16
-      if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
+      if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
         return false;
-      UseMI.getOperand(0).setSubReg(0);
-      if (DstReg.isPhysical()) {
-        DstReg = RI.get32BitRegister(DstReg);
-        UseMI.getOperand(0).setReg(DstReg);
-      }
-      assert(UseMI.getOperand(1).getReg().isVirtual());
     }
     MachineFunction *MF = UseMI.getMF();
-    const MCInstrDesc &NewMCID = get(NewOpc);
-    const TargetRegisterClass *NewDefRC = getRegClass(NewMCID, 0, &RI, *MF);
-    if (DstReg.isPhysical()) {
-      if (!NewDefRC->contains(DstReg))
-        return false;
-    } else if (!MRI->constrainRegClass(DstReg, NewDefRC))
+    unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
+    MCRegister MovDstPhysReg =
+        DstReg.isPhysical() ? MCRegister(DstReg) : MCRegister();
+    std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
+    // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
+    for (unsigned MovOp :
+         {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
+          AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
+      const MCInstrDesc &MovDesc = get(MovOp);
+      const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI, *MF);
+      if (Is16Bit) {
+        // We just need to find a correctly sized register class, so the
+        // subregister index compatibility doesn't matter since we're statically
+        // extracting the immediate value.
+        MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
+        if (!MovDstRC)
+          continue;
+        if (MovDstPhysReg) {
+          // FIXME: We probably should not do this. If there is a live value in
+          // the high half of the register, it will be corrupted.
+          MovDstPhysReg = RI.getMatchingSuperReg(MCRegister(DstReg),
+                                                 AMDGPU::lo16, MovDstRC);
+          if (!MovDstPhysReg)
+            continue;
+        }
+      }
+      // Result class isn't the right size, try the next instruction.
+      if (MovDstPhysReg) {
+        if (!MovDstRC->contains(MovDstPhysReg))
+          return false;
+      } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
+        // TODO: This will be overly conservative in the case of 16-bit virtual
+        // SGPRs. We could hack up the virtual register uses to use a compatible
+        // 32-bit class.
+        continue;
+      }
+      const MCOperandInfo &OpInfo = MovDesc.operands()[1];
+      // Ensure the interpreted immediate value is a valid operand in the new
+      // mov.
+      //
+      // FIXME: isImmOperandLegal should have form that doesn't require existing
+      // MachineInstr or MachineOperand
+      if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
+          !isInlineConstant(*SubRegImm, OpInfo.OperandType))
+        break;
+      NewOpc = MovOp;
+      break;
+    }
+    if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
      return false;
+    if (Is16Bit) {
+      UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
+      if (MovDstPhysReg)
+        UseMI.getOperand(0).setReg(MovDstPhysReg);
+      assert(UseMI.getOperand(1).getReg().isVirtual());
+    }
+    const MCInstrDesc &NewMCID = get(NewOpc);
     UseMI.setDesc(NewMCID);
-    UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
+    UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
     UseMI.addImplicitDefUseOperands(*MF);
     return true;
   }


@@ -188,8 +188,7 @@ body: |
     ; GCN-LABEL: name: fold_sreg_64_to_sreg_64
     ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
-    ; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B]]
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B64_]]
     %0:sreg_64 = S_MOV_B64 1311768467750121200
     %1:sreg_64 = COPY killed %0
     SI_RETURN_TO_EPILOG %1
@@ -761,8 +760,8 @@ body: |
   bb.0:
     ; GCN-LABEL: name: fold_av_mov_b32_imm_pseudo_inlineimm_to_av
     ; GCN: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY killed [[AV_MOV_]]
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 64, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B32_e32_]]
     %0:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
     %1:av_32 = COPY killed %0
     SI_RETURN_TO_EPILOG implicit %1
@@ -800,9 +799,67 @@ body: |
   bb.0:
     ; GCN-LABEL: name: fold_av_mov_b64_imm_pseudo_inlineimm_to_av
     ; GCN: [[AV_MOV_:%[0-9]+]]:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64_align2 = COPY killed [[AV_MOV_]]
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 64, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
     %0:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
     %1:av_64_align2 = COPY killed %0
     SI_RETURN_TO_EPILOG implicit %1
 ...
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+body: |
+  bb.0:
+    ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].lo16
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+    %0:sreg_64 = S_MOV_B64 64
+    %1:sgpr_lo16 = COPY killed %0.lo16
+    SI_RETURN_TO_EPILOG %1
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+body: |
+  bb.0:
+    ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].hi16
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+    %0:sreg_64 = S_MOV_B64 64
+    %1:sgpr_lo16 = COPY killed %0.hi16
+    SI_RETURN_TO_EPILOG %1
+...
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+body: |
+  bb.0:
+    ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 64
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64 64
+    $sgpr0_lo16 = COPY killed %0.lo16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+body: |
+  bb.0:
+    ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64 64
+    $sgpr0_lo16 = COPY killed %0.hi16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+...