[X86] apply mulx optimization for two-wide mul instruction (mull, mulq) (#185127)

References: https://github.com/llvm/llvm-project/pull/184462

In the discussion for the linked PR, which removes unnecessary register-to-register moves when one operand is in %rdx for mulx, the point was brought up that this pattern also happens for mull and mulq.

The IR below:

```llvm
declare i32 @foo32()
declare i64 @foo64()

define i32 @mul32_no_implicit_copy(i32 %a0) {
  %a1 = call i32 @foo32()
  %a2 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a0, i32 %a1)
  %a3 = extractvalue { i32, i1 } %a2, 0
  ret i32 %a3
}

define i64 @mul64_no_implicit_copy(i64 %a0) {
  %a1 = call i64 @foo64()
  %a2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a0, i64 %a1)
  %a3 = extractvalue { i64, i1 } %a2, 0
  ret i64 %a3
}
```

generates this code on current HEAD:

```asm
mul32_no_implicit_copy:                 # @mul32_no_implicit_copy
        push    rbx
        mov     ebx, edi
        call    foo32@PLT
        mov     ecx, eax
        mov     eax, ebx
        mul     ecx
        pop     rbx
        ret
mul64_no_implicit_copy:                 # @mul64_no_implicit_copy
        push    rbx
        mov     rbx, rdi
        call    foo64@PLT
        mov     rcx, rax
        mov     rax, rbx
        mul     rcx
        pop     rbx
        ret
```

where the register shuffling before the mul is the same pattern as for mulx in the previous PR.

With this branch it now generates this code:

```asm
mul32_no_implicit_copy:
        pushq   %rbx
        movl    %edi, %ebx
        callq   foo32@PLT
        mull    %ebx
        popq    %rbx
        retq
mul64_no_implicit_copy:
        pushq   %rbx
        movq    %rdi, %rbx
        callq   foo64@PLT
        mulq    %rbx
        popq    %rbx
        retq
```
2026-03-14 09:02:30 -04:00 · 2026-03-14 09:02:30 -04:00 · 1b85c6322d
commit 1b85c6322d
parent 54dca1e431
2 changed files with 63 additions and 17 deletions
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@ -5050,6 +5050,21 @@ VPTESTM_CASE(v32i16, WZ##SUFFIX)
 #undef VPTESTM_CASE
 }

+static void orderRegForMul(SDValue &N0, SDValue &N1, const unsigned LoReg, // Swaps N0/N1 in place so that N0 is the operand already copied from LoReg.
+                           const MachineRegisterInfo &MRI) { // MRI is only queried for live-in physical registers.
+  auto GetPhysReg = [&](SDValue V) -> Register { // Physical register V is copied from, if one can be determined.
+    if (V.getOpcode() != ISD::CopyFromReg)
+      return Register(); // Not a CopyFromReg: default-constructed Register.
+    Register Reg = cast<RegisterSDNode>(V.getOperand(1))->getReg(); // Operand 1 of CopyFromReg is the source register node.
+    if (Reg.isVirtual())
+      return MRI.getLiveInPhysReg(Reg); // Virtual register: use the live-in physical register MRI reports for it.
+    return Reg; // Already a non-virtual register.
+  };
+
+  if (GetPhysReg(N1) == LoReg && GetPhysReg(N0) != LoReg) // Swap only when N1 is in LoReg and N0 is not.
+    std::swap(N0, N1); // Callers feed N0 into the CopyToReg of LoReg, so this avoids a reg-to-reg move.
+}
+
 // Try to create VPTESTM instruction. If InMask is not null, it will be used
 // to form a masked operation.
 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
@ -5796,6 +5811,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
        std::swap(N0, N1);
    }

+    // UMUL/SMUL have an implicit source in LoReg (AL/AX/EAX/RAX). Prefer the
+    // operand that's already there to avoid an extra register-to-register move.
+    if (!FoldedLoad)
+      orderRegForMul(N0, N1, LoReg, CurDAG->getMachineFunction().getRegInfo());
+
    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);

@ -5882,23 +5902,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
        std::swap(N0, N1);
    }

-    // For MULX, the implicit source must be in RDX (LoReg). If N1 is
-    // already a CopyFromReg of LoReg and N0 is not, flip so that N0
-    // (which feeds the CopyToReg below) is the operand already in LoReg,
-    // avoiding an unnecessary register-to-register copy before the multiply.
-    if (UseMULX && !foldedLoad) {
-      MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
-      auto GetPhysReg = [&](SDValue V) -> Register {
-        if (V.getOpcode() != ISD::CopyFromReg)
-          return Register();
-        Register Reg = cast<RegisterSDNode>(V.getOperand(1))->getReg();
-        if (Reg.isVirtual())
-          return MRI.getLiveInPhysReg(Reg);
-        return Reg;
-      };
-      if (GetPhysReg(N1) == LoReg && GetPhysReg(N0) != LoReg)
-        std::swap(N0, N1);
-    }
+    // UMUL/SMUL_LOHI has an implicit source in LoReg (RDX for MULX, RAX for
+    // MUL/IMUL). Prefer the operand that's already there.
+    if (!foldedLoad)
+      orderRegForMul(N0, N1, LoReg, CurDAG->getMachineFunction().getRegInfo());

    SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);
--- a/llvm/test/CodeGen/X86/mul-lohi-no-implicit-copy.ll
+++ b/llvm/test/CodeGen/X86/mul-lohi-no-implicit-copy.ll
@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi2 | FileCheck %s
+;
+; For UMUL_LOHI lowering without BMI2, MUL/IMUL use RAX as an implicit source.
+; If one operand already lives in RAX (e.g. call result), avoid shuffling it out
+; and back before the multiply.
+
+declare i32 @foo32()
+declare i64 @foo64()
+
+define i32 @mul32_no_implicit_copy(i32 %a0) nounwind {
+; CHECK-LABEL: mul32_no_implicit_copy:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movl %edi, %ebx
+; CHECK-NEXT:    callq foo32@PLT
+; CHECK-NEXT:    mull %ebx
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+  %a1 = call i32 @foo32() ; operand that comes back from a call
+  %a2 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a0, i32 %a1)
+  %a3 = extractvalue { i32, i1 } %a2, 0 ; keep only field 0 of the { i32, i1 } result
+  ret i32 %a3
+}
+
+define i64 @mul64_no_implicit_copy(i64 %a0) nounwind {
+; CHECK-LABEL: mul64_no_implicit_copy:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    callq foo64@PLT
+; CHECK-NEXT:    mulq %rbx
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
+  %a1 = call i64 @foo64() ; operand that comes back from a call
+  %a2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a0, i64 %a1)
+  %a3 = extractvalue { i64, i1 } %a2, 0 ; keep only field 0 of the { i64, i1 } result
+  ret i64 %a3
+}