AMDGPU: Simplify foldImmediate with register class based checks
Generalize the code over the properties of the mov instruction, rather than maintaining parallel logic to figure out which type of mov to use.

I've maintained the behavior with 16-bit physical SGPRs, though I think that behavior is broken and corrupts any value that happens to be live in the high bits. It just happens there is no way to write those bits separately with a real instruction, but I don't think we should be making assumptions around that property.

This is NFC-ish. It now does a better job with the imm pseudos, which in practice won't reach here, and it will also make it easier to support more folds in a future patch.

I added a couple of new tests with 16-bit extracts of 64-bit sources. The only other test change is an immediate rendering change from zero extended to sign extended.
This commit is contained in:
parent d2b810e24f
commit a32d42e594
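In outline, the patch below replaces the hand-rolled opcode selection with a single loop over candidate mov opcodes, taking the first one whose definition register class is compatible with the copy's destination and whose immediate operand is legal. The following is only a condensed sketch of that shape, restating calls that appear in the diff; it omits the 16-bit sub-register extraction and physical-register handling that the real change also performs.

  // Condensed sketch of the selection loop added by this patch (see the
  // full diff below for the Is16Bit and physical-register paths).
  unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
  for (unsigned MovOp : {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32,
                         AMDGPU::S_MOV_B64, AMDGPU::V_MOV_B64_PSEUDO,
                         AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
    const MCInstrDesc &MovDesc = get(MovOp);

    // The candidate mov's definition register class must be able to hold
    // the copy's destination register.
    if (!MRI->constrainRegClass(DstReg, getRegClass(MovDesc, 0, &RI, *MF)))
      continue;

    // The extracted immediate must be encodable on the candidate's source
    // operand, either as a literal or as an inline constant.
    const MCOperandInfo &OpInfo = MovDesc.operands()[1];
    if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
        !isInlineConstant(*SubRegImm, OpInfo.OperandType))
      break;

    NewOpc = MovOp;
    break;
  }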
@@ -3573,54 +3573,93 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
 
     Register DstReg = UseMI.getOperand(0).getReg();
-    unsigned OpSize = getOpSize(UseMI, 0);
-    bool Is16Bit = OpSize == 2;
-    bool Is64Bit = OpSize == 8;
-    bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
-    unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
-                                           : AMDGPU::V_MOV_B32_e32
-                      : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
-                                : AMDGPU::S_MOV_B32;
+    Register UseSubReg = UseMI.getOperand(1).getSubReg();
 
-    std::optional<int64_t> SubRegImm =
-        extractSubregFromImm(Imm, UseMI.getOperand(1).getSubReg());
+    const TargetRegisterClass *DstRC = RI.getRegClassForReg(*MRI, DstReg);
 
-    APInt Imm(Is64Bit ? 64 : 32, *SubRegImm,
-              /*isSigned=*/true, /*implicitTrunc=*/true);
-
-    if (RI.isAGPR(*MRI, DstReg)) {
-      if (Is64Bit || !isInlineConstant(Imm))
-        return false;
-      NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
-    }
+    bool Is16Bit = UseSubReg != AMDGPU::NoSubRegister &&
+                   RI.getSubRegIdxSize(UseSubReg) == 16;
 
     if (Is16Bit) {
-      if (isVGPRCopy)
+      if (RI.hasVGPRs(DstRC))
         return false; // Do not clobber vgpr_hi16
 
-      if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
+      if (DstReg.isVirtual() && UseSubReg != AMDGPU::lo16)
         return false;
-
-      UseMI.getOperand(0).setSubReg(0);
-      if (DstReg.isPhysical()) {
-        DstReg = RI.get32BitRegister(DstReg);
-        UseMI.getOperand(0).setReg(DstReg);
-      }
-      assert(UseMI.getOperand(1).getReg().isVirtual());
     }
 
     MachineFunction *MF = UseMI.getMF();
-    const MCInstrDesc &NewMCID = get(NewOpc);
-    const TargetRegisterClass *NewDefRC = getRegClass(NewMCID, 0, &RI, *MF);
-
-    if (DstReg.isPhysical()) {
-      if (!NewDefRC->contains(DstReg))
-        return false;
-    } else if (!MRI->constrainRegClass(DstReg, NewDefRC))
-      return false;
+    unsigned NewOpc = AMDGPU::INSTRUCTION_LIST_END;
+    MCRegister MovDstPhysReg =
+        DstReg.isPhysical() ? MCRegister(DstReg) : MCRegister();
+
+    std::optional<int64_t> SubRegImm = extractSubregFromImm(Imm, UseSubReg);
+
+    // TODO: Try to fold with AMDGPU::V_MOV_B16_t16_e64
+    for (unsigned MovOp :
+         {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
+          AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_ACCVGPR_WRITE_B32_e64}) {
+      const MCInstrDesc &MovDesc = get(MovOp);
+
+      const TargetRegisterClass *MovDstRC = getRegClass(MovDesc, 0, &RI, *MF);
+      if (Is16Bit) {
+        // We just need to find a correctly sized register class, so the
+        // subregister index compatibility doesn't matter since we're statically
+        // extracting the immediate value.
+        MovDstRC = RI.getMatchingSuperRegClass(MovDstRC, DstRC, AMDGPU::lo16);
+        if (!MovDstRC)
+          continue;
+
+        if (MovDstPhysReg) {
+          // FIXME: We probably should not do this. If there is a live value in
+          // the high half of the register, it will be corrupted.
+          MovDstPhysReg = RI.getMatchingSuperReg(MCRegister(DstReg),
+                                                 AMDGPU::lo16, MovDstRC);
+          if (!MovDstPhysReg)
+            continue;
+        }
+      }
+
+      // Result class isn't the right size, try the next instruction.
+      if (MovDstPhysReg) {
+        if (!MovDstRC->contains(MovDstPhysReg))
+          return false;
+      } else if (!MRI->constrainRegClass(DstReg, MovDstRC)) {
+        // TODO: This will be overly conservative in the case of 16-bit virtual
+        // SGPRs. We could hack up the virtual register uses to use a compatible
+        // 32-bit class.
+        continue;
+      }
+
+      const MCOperandInfo &OpInfo = MovDesc.operands()[1];
+
+      // Ensure the interpreted immediate value is a valid operand in the new
+      // mov.
+      //
+      // FIXME: isImmOperandLegal should have form that doesn't require existing
+      // MachineInstr or MachineOperand
+      if (!RI.opCanUseLiteralConstant(OpInfo.OperandType) &&
+          !isInlineConstant(*SubRegImm, OpInfo.OperandType))
+        break;
+
+      NewOpc = MovOp;
+      break;
+    }
+
+    if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
+      return false;
+
+    if (Is16Bit) {
+      UseMI.getOperand(0).setSubReg(AMDGPU::NoSubRegister);
+      if (MovDstPhysReg)
+        UseMI.getOperand(0).setReg(MovDstPhysReg);
+      assert(UseMI.getOperand(1).getReg().isVirtual());
+    }
 
+    const MCInstrDesc &NewMCID = get(NewOpc);
     UseMI.setDesc(NewMCID);
-    UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
+    UseMI.getOperand(1).ChangeToImmediate(*SubRegImm);
     UseMI.addImplicitDefUseOperands(*MF);
     return true;
   }
@@ -188,8 +188,7 @@ body: |
 
     ; GCN-LABEL: name: fold_sreg_64_to_sreg_64
     ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 1311768467750121200
-    ; GCN-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 1311768467750121200
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B]]
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[S_MOV_B64_]]
     %0:sreg_64 = S_MOV_B64 1311768467750121200
     %1:sreg_64 = COPY killed %0
     SI_RETURN_TO_EPILOG %1
@@ -761,8 +760,8 @@ body: |
   bb.0:
     ; GCN-LABEL: name: fold_av_mov_b32_imm_pseudo_inlineimm_to_av
     ; GCN: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY killed [[AV_MOV_]]
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+    ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 64, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B32_e32_]]
     %0:av_32 = AV_MOV_B32_IMM_PSEUDO 64, implicit $exec
     %1:av_32 = COPY killed %0
     SI_RETURN_TO_EPILOG implicit %1
@@ -800,9 +799,67 @@ body: |
   bb.0:
     ; GCN-LABEL: name: fold_av_mov_b64_imm_pseudo_inlineimm_to_av
     ; GCN: [[AV_MOV_:%[0-9]+]]:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:av_64_align2 = COPY killed [[AV_MOV_]]
-    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[COPY]]
+    ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 64, implicit $exec
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
     %0:av_64_align2 = AV_MOV_B64_IMM_PSEUDO 64, implicit $exec
     %1:av_64_align2 = COPY killed %0
     SI_RETURN_TO_EPILOG implicit %1
 ...
+
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_virt_sgpr16
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].lo16
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+    %0:sreg_64 = S_MOV_B64 64
+    %1:sgpr_lo16 = COPY killed %0.lo16
+    SI_RETURN_TO_EPILOG %1
+
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_virt_sgpr16
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B64_]].hi16
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG [[COPY]]
+    %0:sreg_64 = S_MOV_B64 64
+    %1:sgpr_lo16 = COPY killed %0.hi16
+    SI_RETURN_TO_EPILOG %1
+
+...
+
+---
+name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_lo_from_mov_64_phys_sgpr16_lo
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 64
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64 64
+    $sgpr0_lo16 = COPY killed %0.lo16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...
+---
+name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: fold_simm_16_sub_to_hi_from_mov_64_inline_imm_phys_sgpr16_lo
+    ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 64
+    ; GCN-NEXT: $sgpr0 = S_MOV_B32 0
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG $sgpr0_lo16
+    %0:sreg_64 = S_MOV_B64 64
+    $sgpr0_lo16 = COPY killed %0.hi16
+    SI_RETURN_TO_EPILOG $sgpr0_lo16
+
+...