[X86] apply mulx optimization for two-wide mul instruction (mull, mulq) (#185127)

References: https://github.com/llvm/llvm-project/pull/184462

In the discussion for the linked PR, which removes unnecessary register
to register moves when one operand is in %rdx for mulx, the point was
brought up that this pattern also happens for mull and mulq.

The IR below:

```llvm
declare i32 @foo32()
declare i64 @foo64()

define i32 @mul32_no_implicit_copy(i32 %a0) {
  %a1 = call i32 @foo32()
  %a2 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a0, i32 %a1)
  %a3 = extractvalue { i32, i1 } %a2, 0
  ret i32 %a3
}

define i64 @mul64_no_implicit_copy(i64 %a0) {
  %a1 = call i64 @foo64()
  %a2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a0, i64 %a1)
  %a3 = extractvalue { i64, i1 } %a2, 0
  ret i64 %a3
}
```

Generates this code on current HEAD:

```asm
mul32_no_implicit_copy:                 # @mul32_no_implicit_copy
        push    rbx
        mov     ebx, edi
        call    foo32@PLT
        mov     ecx, eax
        mov     eax, ebx
        mul     ecx
        pop     rbx
        ret
mul64_no_implicit_copy:                 # @mul64_no_implicit_copy
        push    rbx
        mov     rbx, rdi
        call    foo64@PLT
        mov     rcx, rax
        mov     rax, rbx
        mul     rcx
        pop     rbx
        ret
```

The register shuffling before the mul is the same pattern that the previous
PR addressed for mulx.

With this branch it generates this code now:

```asm
mul32_no_implicit_copy:
	pushq	%rbx
	movl	%edi, %ebx
	callq	foo32@PLT
	mull	%ebx
	popq	%rbx
	retq
mul64_no_implicit_copy:
	pushq	%rbx
	movq	%rdi, %rbx
	callq	foo64@PLT
	mulq	%rbx
	popq	%rbx
	retq
```
This commit is contained in:
Takashi Idobe 2026-03-14 09:02:30 -04:00 committed by GitHub
parent 54dca1e431
commit 1b85c6322d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 63 additions and 17 deletions

View File

@ -5050,6 +5050,21 @@ VPTESTM_CASE(v32i16, WZ##SUFFIX)
#undef VPTESTM_CASE
}
/// Reorder the operands of a multiply so that the one already held in
/// \p LoReg, if any, ends up in \p N0.
///
/// An operand counts as being "in" a physical register when it is a
/// CopyFromReg node whose source register is that physical register, or is a
/// virtual register that \p MRI resolves to it through getLiveInPhysReg. When
/// \p N1 is in \p LoReg and \p N0 is not, the two operands are exchanged;
/// in every other case both are left untouched.
///
/// \param N0    First multiply operand; may be exchanged with \p N1.
/// \param N1    Second multiply operand; may be exchanged with \p N0.
/// \param LoReg Physical register number used as the implicit multiply source.
/// \param MRI   Register info used to resolve virtual registers.
static void orderRegForMul(SDValue &N0, SDValue &N1, const unsigned LoReg,
                           const MachineRegisterInfo &MRI) {
  // Register that Op is copied out of, or a default-constructed Register when
  // Op is not a CopyFromReg node.
  const auto SourceRegOf = [&MRI](SDValue Op) -> Register {
    if (Op.getOpcode() == ISD::CopyFromReg) {
      Register Src = cast<RegisterSDNode>(Op.getOperand(1))->getReg();
      if (!Src.isVirtual())
        return Src;
      // Virtual registers are resolved through the live-in mapping.
      return MRI.getLiveInPhysReg(Src);
    }
    return Register();
  };

  // Nothing to gain unless N1 is the operand sitting in LoReg.
  if (SourceRegOf(N1) != LoReg)
    return;
  // N1 is in LoReg; exchange only when N0 is not there as well.
  if (SourceRegOf(N0) != LoReg)
    std::swap(N0, N1);
}
// Try to create VPTESTM instruction. If InMask is not null, it will be used
// to form a masked operation.
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
@ -5796,6 +5811,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
std::swap(N0, N1);
}
// UMUL/SMUL have an implicit source in LoReg (AL/AX/EAX/RAX). Prefer the
// operand that's already there to avoid an extra register-to-register move.
if (!FoldedLoad)
orderRegForMul(N0, N1, LoReg, CurDAG->getMachineFunction().getRegInfo());
SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);
@ -5882,23 +5902,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
std::swap(N0, N1);
}
// For MULX, the implicit source must be in RDX (LoReg). If N1 is
// already a CopyFromReg of LoReg and N0 is not, flip so that N0
// (which feeds the CopyToReg below) is the operand already in LoReg,
// avoiding an unnecessary register-to-register copy before the multiply.
if (UseMULX && !foldedLoad) {
MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
auto GetPhysReg = [&](SDValue V) -> Register {
if (V.getOpcode() != ISD::CopyFromReg)
return Register();
Register Reg = cast<RegisterSDNode>(V.getOperand(1))->getReg();
if (Reg.isVirtual())
return MRI.getLiveInPhysReg(Reg);
return Reg;
};
if (GetPhysReg(N1) == LoReg && GetPhysReg(N0) != LoReg)
std::swap(N0, N1);
}
// UMUL/SMUL_LOHI has an implicit source in LoReg (RDX for MULX, RAX for
// MUL/IMUL). Prefer the operand that's already there.
if (!foldedLoad)
orderRegForMul(N0, N1, LoReg, CurDAG->getMachineFunction().getRegInfo());
SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);

View File

@ -0,0 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi2 | FileCheck %s
;
; For unsigned multiply lowering without BMI2 (exercised here through
; llvm.umul.with.overflow), the one-operand MUL uses EAX/RAX as an implicit
; source. If one operand already lives there (e.g. a call result), avoid
; shuffling it out and back before the multiply.
declare i32 @foo32()
declare i64 @foo64()
; 32-bit case: %a0 is kept in %ebx across the call to @foo32, and the call
; result is then multiplied by it with a single `mull %ebx`. The CHECK-NEXT
; sequence allows no register-to-register move between the call and the mull.
define i32 @mul32_no_implicit_copy(i32 %a0) nounwind {
; CHECK-LABEL: mul32_no_implicit_copy:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movl %edi, %ebx
; CHECK-NEXT: callq foo32@PLT
; CHECK-NEXT: mull %ebx
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
%a1 = call i32 @foo32()
%a2 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a0, i32 %a1)
%a3 = extractvalue { i32, i1 } %a2, 0
ret i32 %a3
}
; 64-bit case: %a0 is kept in %rbx across the call to @foo64, and the call
; result is then multiplied by it with a single `mulq %rbx`. The CHECK-NEXT
; sequence allows no register-to-register move between the call and the mulq.
define i64 @mul64_no_implicit_copy(i64 %a0) nounwind {
; CHECK-LABEL: mul64_no_implicit_copy:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: callq foo64@PLT
; CHECK-NEXT: mulq %rbx
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
%a1 = call i64 @foo64()
%a2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a0, i64 %a1)
%a3 = extractvalue { i64, i1 } %a2, 0
ret i64 %a3
}