[X86] apply mulx optimization for two-wide mul instruction (mull, mulq) (#185127)
References: https://github.com/llvm/llvm-project/pull/184462

In the discussion for the linked PR, which removes unnecessary register-to-register moves when one operand is already in %rdx for mulx, the point was raised that the same pattern also occurs for mull and mulq.

The IR below:

```llvm
declare i32 @foo32()
declare i64 @foo64()

define i32 @mul32_no_implicit_copy(i32 %a0) {
  %a1 = call i32 @foo32()
  %a2 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a0, i32 %a1)
  %a3 = extractvalue { i32, i1 } %a2, 0
  ret i32 %a3
}

define i64 @mul64_no_implicit_copy(i64 %a0) {
  %a1 = call i64 @foo64()
  %a2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a0, i64 %a1)
  %a3 = extractvalue { i64, i1 } %a2, 0
  ret i64 %a3
}
```

generates this code on current HEAD:

```asm
mul32_no_implicit_copy: # @mul32_no_implicit_copy
  push rbx
  mov ebx, edi
  call foo32@PLT
  mov ecx, eax
  mov eax, ebx
  mul ecx
  pop rbx
  ret
mul64_no_implicit_copy: # @mul64_no_implicit_copy
  push rbx
  mov rbx, rdi
  call foo64@PLT
  mov rcx, rax
  mov rax, rbx
  mul rcx
  pop rbx
  ret
```

where the register shuffling before the mul is the same pattern as for mulx in the previous PR. With this branch, it now generates:

```asm
mul32_no_implicit_copy:
  pushq %rbx
  movl %edi, %ebx
  callq foo32@PLT
  mull %ebx
  popq %rbx
  retq
mul64_no_implicit_copy:
  pushq %rbx
  movq %rdi, %rbx
  callq foo64@PLT
  mulq %rbx
  popq %rbx
  retq
```
This commit is contained in:
parent
54dca1e431
commit
1b85c6322d
@ -5050,6 +5050,21 @@ VPTESTM_CASE(v32i16, WZ##SUFFIX)
|
||||
#undef VPTESTM_CASE
|
||||
}
|
||||
|
||||
/// Put the multiply operand that already lives in \p LoReg into \p N0.
///
/// For each operand, the helper below looks for an ISD::CopyFromReg node and
/// reads the register it copies from; a virtual register is translated through
/// MRI.getLiveInPhysReg(). When only \p N1 resolves to \p LoReg, the two
/// operands are exchanged. In every other case both are left as they are.
static void orderRegForMul(SDValue &N0, SDValue &N1, const unsigned LoReg,
                           const MachineRegisterInfo &MRI) {
  // Yields the register read by a CopyFromReg node, or a default-constructed
  // Register when the value is produced by any other kind of node.
  auto PhysRegOf = [&MRI](SDValue Val) -> Register {
    if (Val.getOpcode() == ISD::CopyFromReg) {
      Register Src = cast<RegisterSDNode>(Val.getOperand(1))->getReg();
      if (!Src.isVirtual())
        return Src;
      return MRI.getLiveInPhysReg(Src);
    }
    return Register();
  };

  // Nothing to do unless the second operand is the one sitting in LoReg.
  if (!(PhysRegOf(N1) == LoReg))
    return;

  // Swap only if the first operand is not in LoReg as well.
  if (PhysRegOf(N0) != LoReg)
    std::swap(N0, N1);
}
|
||||
|
||||
// Try to create VPTESTM instruction. If InMask is not null, it will be used
|
||||
// to form a masked operation.
|
||||
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
|
||||
@ -5796,6 +5811,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
||||
std::swap(N0, N1);
|
||||
}
|
||||
|
||||
// UMUL/SMUL have an implicit source in LoReg (AL/AX/EAX/RAX). Prefer the
|
||||
// operand that's already there to avoid an extra register-to-register move.
|
||||
if (!FoldedLoad)
|
||||
orderRegForMul(N0, N1, LoReg, CurDAG->getMachineFunction().getRegInfo());
|
||||
|
||||
SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
|
||||
N0, SDValue()).getValue(1);
|
||||
|
||||
@ -5882,23 +5902,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
|
||||
std::swap(N0, N1);
|
||||
}
|
||||
|
||||
// For MULX, the implicit source must be in RDX (LoReg). If N1 is
|
||||
// already a CopyFromReg of LoReg and N0 is not, flip so that N0
|
||||
// (which feeds the CopyToReg below) is the operand already in LoReg,
|
||||
// avoiding an unnecessary register-to-register copy before the multiply.
|
||||
if (UseMULX && !foldedLoad) {
|
||||
MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
|
||||
auto GetPhysReg = [&](SDValue V) -> Register {
|
||||
if (V.getOpcode() != ISD::CopyFromReg)
|
||||
return Register();
|
||||
Register Reg = cast<RegisterSDNode>(V.getOperand(1))->getReg();
|
||||
if (Reg.isVirtual())
|
||||
return MRI.getLiveInPhysReg(Reg);
|
||||
return Reg;
|
||||
};
|
||||
if (GetPhysReg(N1) == LoReg && GetPhysReg(N0) != LoReg)
|
||||
std::swap(N0, N1);
|
||||
}
|
||||
// UMUL/SMUL_LOHI has an implicit source in LoReg (RDX for MULX, RAX for
|
||||
// MUL/IMUL). Prefer the operand that's already there.
|
||||
if (!foldedLoad)
|
||||
orderRegForMul(N0, N1, LoReg, CurDAG->getMachineFunction().getRegInfo());
|
||||
|
||||
SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
|
||||
N0, SDValue()).getValue(1);
|
||||
|
||||
39
llvm/test/CodeGen/X86/mul-lohi-no-implicit-copy.ll
Normal file
39
llvm/test/CodeGen/X86/mul-lohi-no-implicit-copy.ll
Normal file
@ -0,0 +1,39 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi2 | FileCheck %s
|
||||
;
|
||||
; For UMUL_LOHI lowering without BMI2, MUL/IMUL use RAX as an implicit source.
|
||||
; If one operand already lives in RAX (e.g. call result), avoid shuffling it out
|
||||
; and back before the multiply.
|
||||
|
||||
declare i32 @foo32()
|
||||
declare i64 @foo64()
|
||||
|
||||
; Calls @foo32, multiplies its result with %a0 through
; llvm.umul.with.overflow.i32 and returns field 0 of the result pair.
; The CHECK-NEXT sequence requires mull to directly follow the callq, i.e. no
; register moves are emitted between the call and the multiply.
define i32 @mul32_no_implicit_copy(i32 %a0) nounwind {
|
||||
; CHECK-LABEL: mul32_no_implicit_copy:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: pushq %rbx
|
||||
; CHECK-NEXT: movl %edi, %ebx
|
||||
; CHECK-NEXT: callq foo32@PLT
|
||||
; CHECK-NEXT: mull %ebx
|
||||
; CHECK-NEXT: popq %rbx
|
||||
; CHECK-NEXT: retq
|
||||
%a1 = call i32 @foo32()
|
||||
%a2 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a0, i32 %a1)
|
||||
%a3 = extractvalue { i32, i1 } %a2, 0
|
||||
ret i32 %a3
|
||||
}
|
||||
|
||||
; Calls @foo64, multiplies its result with %a0 through
; llvm.umul.with.overflow.i64 and returns field 0 of the result pair.
; The CHECK-NEXT sequence requires mulq to directly follow the callq, i.e. no
; register moves are emitted between the call and the multiply.
define i64 @mul64_no_implicit_copy(i64 %a0) nounwind {
|
||||
; CHECK-LABEL: mul64_no_implicit_copy:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: pushq %rbx
|
||||
; CHECK-NEXT: movq %rdi, %rbx
|
||||
; CHECK-NEXT: callq foo64@PLT
|
||||
; CHECK-NEXT: mulq %rbx
|
||||
; CHECK-NEXT: popq %rbx
|
||||
; CHECK-NEXT: retq
|
||||
%a1 = call i64 @foo64()
|
||||
%a2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a0, i64 %a1)
|
||||
%a3 = extractvalue { i64, i1 } %a2, 0
|
||||
ret i64 %a3
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user