
We have two forceExpandWideMUL functions. One takes the low and high halves of two inputs and computes the low and high halves of their product; it does not compute the full 2x-width product. The other signature takes two inputs and computes the low and high halves of their full 2x-width product. Previously it did this by sign/zero-extending the inputs to create the high bits and then calling the first function. We can instead copy the algorithm from the first function and use the Signed flag to decide whether to use SRA or SRL. This avoids multiplying the high parts of the inputs and adding them into the high half of the result, which improves the generated code for signed multiplication. This should improve the performance of #123262. I don't know yet how close we will get to gcc.
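To make the SRA/SRL point concrete, here is a minimal standalone sketch of the half-word schoolbook expansion (essentially the classic Hacker's Delight mulhu/mulhs recipe), not the actual SelectionDAG code; the helper names `mulhu64` and `mulhs64` are made up for illustration. The two routines are structurally identical: the only difference is that the signed one extracts the high halves of the inputs and the carry terms with arithmetic shifts (SRA) instead of logical shifts (SRL), which is what the Signed flag selects.

```cpp
#include <cassert>
#include <cstdint>

// High 64 bits of the unsigned 64x64->128 product, built only from 64-bit
// ops on 32-bit halves. Every shift here is logical (SRL).
static uint64_t mulhu64(uint64_t u, uint64_t v) {
  uint64_t u0 = u & 0xffffffff, u1 = u >> 32; // SRL
  uint64_t v0 = v & 0xffffffff, v1 = v >> 32; // SRL
  uint64_t w0 = u0 * v0;
  uint64_t t  = u1 * v0 + (w0 >> 32);
  uint64_t w1 = (t & 0xffffffff) + u0 * v1;
  return u1 * v1 + (t >> 32) + (w1 >> 32);
}

// Signed variant: the same schoolbook expansion, but the high halves and the
// carries are extracted with arithmetic shifts (SRA). Intermediate math is
// done in uint64_t so overflow wraps without UB; `>>` on a negative int64_t
// is assumed to be an arithmetic shift (guaranteed since C++20).
static int64_t mulhs64(int64_t u, int64_t v) {
  uint64_t u0 = (uint64_t)u & 0xffffffff;
  uint64_t u1 = (uint64_t)(u >> 32); // SRA: sign-extended high half
  uint64_t v0 = (uint64_t)v & 0xffffffff;
  uint64_t v1 = (uint64_t)(v >> 32); // SRA
  uint64_t w0 = u0 * v0;
  uint64_t t  = u1 * v0 + (w0 >> 32);
  uint64_t w1 = (t & 0xffffffff) + u0 * v1;
  uint64_t hi = u1 * v1 + (uint64_t)((int64_t)t >> 32)   // SRA on the carry
                        + (uint64_t)((int64_t)w1 >> 32); // SRA on the carry
  return (int64_t)hi;
}

int main() {
  // The low half is the same plain multiply in both cases; only the high
  // half differs between the signed and unsigned expansions.
  int64_t a = -3, b = 5;
  assert((uint64_t)a * (uint64_t)b == (uint64_t)(a * b));
  assert(mulhs64(a, b) == -1); // -15 has an all-ones high half
  assert(mulhu64(0xffffffffffffffffULL, 2) == 1);
  return 0;
}
```

For comparison, the old lowering effectively ran the unsigned expansion on sign/zero-extended inputs, which for the signed case means two extra multiplies of the sign-extension words that are then added into the high half of the result.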
Updated test case (LLVM IR, 94 lines, 3.0 KiB):
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
%0 = type { i64, i64 }
%1 = type { i128, i1 }

; This used to call muloti4, but that won't link with libgcc.
define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp {
; CHECK-LABEL: x:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: .cfi_offset %rbx, -24
; CHECK-NEXT: .cfi_offset %r14, -16
; CHECK-NEXT: movq %rdx, %r9
; CHECK-NEXT: movq %rsi, %r8
; CHECK-NEXT: movq %rsi, %rbx
; CHECK-NEXT: sarq $63, %rbx
; CHECK-NEXT: imulq %rdx, %rbx
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %rdx
; CHECK-NEXT: movq %rdx, %r10
; CHECK-NEXT: movq %rax, %rsi
; CHECK-NEXT: movq %r8, %rax
; CHECK-NEXT: mulq %r9
; CHECK-NEXT: movq %rdx, %r9
; CHECK-NEXT: movq %rax, %r11
; CHECK-NEXT: addq %r10, %r11
; CHECK-NEXT: adcq %rbx, %r9
; CHECK-NEXT: movq %r9, %rbx
; CHECK-NEXT: sarq $63, %rbx
; CHECK-NEXT: movq %rcx, %r14
; CHECK-NEXT: sarq $63, %r14
; CHECK-NEXT: imulq %rdi, %r14
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: mulq %rcx
; CHECK-NEXT: movq %rdx, %r10
; CHECK-NEXT: movq %rax, %rdi
; CHECK-NEXT: addq %r11, %rdi
; CHECK-NEXT: adcq %r14, %r10
; CHECK-NEXT: movq %r10, %r11
; CHECK-NEXT: sarq $63, %r11
; CHECK-NEXT: addq %r9, %r10
; CHECK-NEXT: adcq %rbx, %r11
; CHECK-NEXT: movq %r8, %rax
; CHECK-NEXT: imulq %rcx
; CHECK-NEXT: addq %r10, %rax
; CHECK-NEXT: adcq %r11, %rdx
; CHECK-NEXT: movq %rdi, %rcx
; CHECK-NEXT: sarq $63, %rcx
; CHECK-NEXT: xorq %rcx, %rdx
; CHECK-NEXT: xorq %rax, %rcx
; CHECK-NEXT: orq %rdx, %rcx
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.2: ## %nooverflow
; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: movq %rdi, %rdx
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
; CHECK-NEXT: retq
; CHECK-NEXT: LBB0_1: ## %overflow
; CHECK-NEXT: ud2
entry:
  %tmp16 = zext i64 %a.coerce0 to i128
  %tmp11 = zext i64 %a.coerce1 to i128
  %tmp12 = shl nuw i128 %tmp11, 64
  %ins14 = or i128 %tmp12, %tmp16
  %tmp6 = zext i64 %b.coerce0 to i128
  %tmp3 = zext i64 %b.coerce1 to i128
  %tmp4 = shl nuw i128 %tmp3, 64
  %ins = or i128 %tmp4, %tmp6
  %0 = tail call %1 @llvm.smul.with.overflow.i128(i128 %ins14, i128 %ins)
  %1 = extractvalue %1 %0, 0
  %2 = extractvalue %1 %0, 1
  br i1 %2, label %overflow, label %nooverflow

overflow: ; preds = %entry
  tail call void @llvm.trap()
  unreachable

nooverflow: ; preds = %entry
  %tmp20 = trunc i128 %1 to i64
  %tmp21 = insertvalue %0 undef, i64 %tmp20, 0
  %tmp22 = lshr i128 %1, 64
  %tmp23 = trunc i128 %tmp22 to i64
  %tmp24 = insertvalue %0 %tmp21, i64 %tmp23, 1
  ret %0 %tmp24
}

declare %1 @llvm.smul.with.overflow.i128(i128, i128) nounwind readnone

declare void @llvm.trap() nounwind