
We have two forceExpandWideMUL overloads. One takes the low and high halves of two inputs and computes the low and high halves of their product; it does not compute the full 2x-width product. The other takes two inputs and computes the low and high halves of their full 2x-width product. Previously the latter did this by sign/zero-extending the inputs to form the high halves and then calling the former. Instead, we can copy the algorithm from the first overload and use the Signed flag to choose between SRA and SRL when splitting the inputs. This avoids multiplying the extended high parts of the inputs and adding them into the high half of the result, which improves the generated code for signed multiplication. This should help the performance of #123262, though I don't know yet how close it gets us to gcc.
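To make the SRA-vs-SRL point concrete, here is a rough C++ sketch of the split-and-combine expansion. This is only an illustration under my own naming, not the actual TargetLowering::forceExpandWideMUL code: the helper `wideMul32` and the 32/16-bit widths are made up. The full 2x-width product is assembled from half-width partial products, and the only signed/unsigned difference is which shift forms the high halves of the operands and of the cross products.

```cpp
#include <cstdint>

// Rough sketch only (not the LLVM implementation): compute the full 64-bit
// product of two 32-bit inputs using only 32-bit multiplies. The signed and
// unsigned flavours differ only in the shift used to form the high halves
// (SRA vs. SRL); no multiplies of sign/zero-extended "high inputs" are needed.
static void wideMul32(uint32_t A, uint32_t B, bool Signed,
                      uint32_t &Lo, uint32_t &Hi) {
  auto shr16 = [Signed](uint32_t V) -> uint32_t {
    // SRA for signed inputs, SRL for unsigned inputs.
    return Signed ? uint32_t(int32_t(V) >> 16) : V >> 16;
  };
  uint32_t AL = A & 0xFFFF, BL = B & 0xFFFF; // low halves are always unsigned
  uint32_t AH = shr16(A), BH = shr16(B);     // high halves carry the sign

  uint32_t LL = AL * BL; // half-width partial products (wrapping 32-bit math)
  uint32_t LH = AL * BH;
  uint32_t HL = AH * BL;
  uint32_t HH = AH * BH;

  // Low word, plus the carry out of the middle column.
  uint32_t Mid = (LL >> 16) + (LH & 0xFFFF) + (HL & 0xFFFF);
  Lo = (Mid << 16) | (LL & 0xFFFF);
  // High word: cross products shifted with the same signedness as the inputs.
  Hi = HH + shr16(LH) + shr16(HL) + (Mid >> 16);
}
```

Under this scheme the old path's extra work disappears: there is no sign/zero-extended high input to multiply and fold into the high half of the result.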
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=thumbv6m-none-unknown-eabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=ARM

declare i4 @llvm.umul.fix.i4 (i4, i4, i32)
declare i32 @llvm.umul.fix.i32 (i32, i32, i32)
declare i64 @llvm.umul.fix.i64 (i64, i64, i32)

define i32 @func(i32 %x, i32 %y) nounwind {
; ARM-LABEL: func:
; ARM: @ %bb.0:
; ARM-NEXT: .save {r7, lr}
; ARM-NEXT: push {r7, lr}
; ARM-NEXT: mov r2, r1
; ARM-NEXT: movs r1, #0
; ARM-NEXT: mov r3, r1
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: lsrs r0, r0, #2
; ARM-NEXT: lsls r1, r1, #30
; ARM-NEXT: adds r0, r1, r0
; ARM-NEXT: pop {r7, pc}
%tmp = call i32 @llvm.umul.fix.i32(i32 %x, i32 %y, i32 2)
ret i32 %tmp
}

define i64 @func2(i64 %x, i64 %y) nounwind {
; ARM-LABEL: func2:
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
; ARM-NEXT: .pad #20
; ARM-NEXT: sub sp, #20
; ARM-NEXT: str r3, [sp, #12] @ 4-byte Spill
; ARM-NEXT: mov r4, r2
; ARM-NEXT: mov r6, r1
; ARM-NEXT: str r1, [sp, #8] @ 4-byte Spill
; ARM-NEXT: mov r5, r0
; ARM-NEXT: movs r7, #0
; ARM-NEXT: mov r1, r7
; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
; ARM-NEXT: mov r0, r6
; ARM-NEXT: mov r1, r7
; ARM-NEXT: mov r2, r4
; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: mov r4, r1
; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
; ARM-NEXT: adcs r4, r7
; ARM-NEXT: mov r0, r5
; ARM-NEXT: mov r1, r7
; ARM-NEXT: ldr r6, [sp, #12] @ 4-byte Reload
; ARM-NEXT: mov r2, r6
; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: mov r5, r1
; ARM-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
; ARM-NEXT: str r0, [sp, #4] @ 4-byte Spill
; ARM-NEXT: adcs r5, r4
; ARM-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; ARM-NEXT: mov r1, r7
; ARM-NEXT: mov r2, r6
; ARM-NEXT: mov r3, r7
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: adds r0, r0, r5
; ARM-NEXT: lsls r0, r0, #30
; ARM-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; ARM-NEXT: lsrs r1, r2, #2
; ARM-NEXT: adds r1, r0, r1
; ARM-NEXT: lsls r0, r2, #30
; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
; ARM-NEXT: lsrs r2, r2, #2
; ARM-NEXT: adds r0, r0, r2
; ARM-NEXT: add sp, #20
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 2)
ret i64 %tmp
}

define i4 @func3(i4 %x, i4 %y) nounwind {
; ARM-LABEL: func3:
; ARM: @ %bb.0:
; ARM-NEXT: .save {r7, lr}
; ARM-NEXT: push {r7, lr}
; ARM-NEXT: movs r2, #15
; ARM-NEXT: ands r0, r2
; ARM-NEXT: ands r2, r1
; ARM-NEXT: movs r1, #0
; ARM-NEXT: mov r3, r1
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: lsrs r0, r0, #2
; ARM-NEXT: lsls r1, r1, #30
; ARM-NEXT: adds r0, r1, r0
; ARM-NEXT: pop {r7, pc}
%tmp = call i4 @llvm.umul.fix.i4(i4 %x, i4 %y, i32 2)
ret i4 %tmp
}

;; These result in regular integer multiplication
define i32 @func4(i32 %x, i32 %y) nounwind {
; ARM-LABEL: func4:
; ARM: @ %bb.0:
; ARM-NEXT: muls r0, r1, r0
; ARM-NEXT: bx lr
%tmp = call i32 @llvm.umul.fix.i32(i32 %x, i32 %y, i32 0)
ret i32 %tmp
}

define i64 @func5(i64 %x, i64 %y) nounwind {
; ARM-LABEL: func5:
; ARM: @ %bb.0:
; ARM-NEXT: .save {r7, lr}
; ARM-NEXT: push {r7, lr}
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: pop {r7, pc}
%tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 0)
ret i64 %tmp
}

define i4 @func6(i4 %x, i4 %y) nounwind {
; ARM-LABEL: func6:
; ARM: @ %bb.0:
; ARM-NEXT: movs r2, #15
; ARM-NEXT: ands r1, r2
; ARM-NEXT: ands r0, r2
; ARM-NEXT: muls r0, r1, r0
; ARM-NEXT: bx lr
%tmp = call i4 @llvm.umul.fix.i4(i4 %x, i4 %y, i32 0)
ret i4 %tmp
}

define i64 @func7(i64 %x, i64 %y) nounwind {
; ARM-LABEL: func7:
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
; ARM-NEXT: .pad #12
; ARM-NEXT: sub sp, #12
; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
; ARM-NEXT: mov r4, r2
; ARM-NEXT: mov r5, r1
; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
; ARM-NEXT: mov r7, r0
; ARM-NEXT: movs r6, #0
; ARM-NEXT: mov r1, r6
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: str r1, [sp] @ 4-byte Spill
; ARM-NEXT: mov r0, r5
; ARM-NEXT: mov r1, r6
; ARM-NEXT: mov r2, r4
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: mov r4, r1
; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
; ARM-NEXT: str r0, [sp] @ 4-byte Spill
; ARM-NEXT: adcs r4, r6
; ARM-NEXT: mov r0, r7
; ARM-NEXT: mov r1, r6
; ARM-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; ARM-NEXT: mov r2, r7
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: mov r5, r1
; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
; ARM-NEXT: str r0, [sp] @ 4-byte Spill
; ARM-NEXT: adcs r5, r4
; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; ARM-NEXT: mov r1, r6
; ARM-NEXT: mov r2, r7
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: adds r1, r0, r5
; ARM-NEXT: ldr r0, [sp] @ 4-byte Reload
; ARM-NEXT: add sp, #12
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 32)
ret i64 %tmp
}

define i64 @func8(i64 %x, i64 %y) nounwind {
; ARM-LABEL: func8:
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
; ARM-NEXT: .pad #12
; ARM-NEXT: sub sp, #12
; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
; ARM-NEXT: mov r4, r2
; ARM-NEXT: mov r5, r1
; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
; ARM-NEXT: mov r7, r0
; ARM-NEXT: movs r6, #0
; ARM-NEXT: mov r1, r6
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: str r1, [sp] @ 4-byte Spill
; ARM-NEXT: mov r0, r5
; ARM-NEXT: mov r1, r6
; ARM-NEXT: mov r2, r4
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: mov r4, r1
; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r1
; ARM-NEXT: str r0, [sp] @ 4-byte Spill
; ARM-NEXT: adcs r4, r6
; ARM-NEXT: mov r0, r7
; ARM-NEXT: mov r1, r6
; ARM-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
; ARM-NEXT: mov r2, r5
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: ldr r2, [sp] @ 4-byte Reload
; ARM-NEXT: adds r0, r0, r2
; ARM-NEXT: str r0, [sp] @ 4-byte Spill
; ARM-NEXT: adcs r1, r6
; ARM-NEXT: adds r4, r4, r1
; ARM-NEXT: mov r7, r6
; ARM-NEXT: adcs r7, r6
; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; ARM-NEXT: mov r1, r6
; ARM-NEXT: mov r2, r5
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: adds r0, r0, r4
; ARM-NEXT: adcs r1, r7
; ARM-NEXT: lsls r1, r1, #1
; ARM-NEXT: lsrs r2, r0, #31
; ARM-NEXT: adds r1, r1, r2
; ARM-NEXT: lsls r0, r0, #1
; ARM-NEXT: ldr r2, [sp] @ 4-byte Reload
; ARM-NEXT: lsrs r2, r2, #31
; ARM-NEXT: adds r0, r0, r2
; ARM-NEXT: add sp, #12
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 63)
ret i64 %tmp
}

define i64 @func9(i64 %x, i64 %y) nounwind {
; ARM-LABEL: func9:
; ARM: @ %bb.0:
; ARM-NEXT: .save {r4, r5, r6, r7, lr}
; ARM-NEXT: push {r4, r5, r6, r7, lr}
; ARM-NEXT: .pad #12
; ARM-NEXT: sub sp, #12
; ARM-NEXT: str r3, [sp, #8] @ 4-byte Spill
; ARM-NEXT: mov r4, r2
; ARM-NEXT: mov r5, r1
; ARM-NEXT: str r1, [sp, #4] @ 4-byte Spill
; ARM-NEXT: mov r7, r0
; ARM-NEXT: movs r6, #0
; ARM-NEXT: mov r1, r6
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: str r1, [sp] @ 4-byte Spill
; ARM-NEXT: mov r0, r5
; ARM-NEXT: mov r1, r6
; ARM-NEXT: mov r2, r4
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: mov r4, r1
; ARM-NEXT: ldr r1, [sp] @ 4-byte Reload
; ARM-NEXT: adds r5, r0, r1
; ARM-NEXT: adcs r4, r6
; ARM-NEXT: mov r0, r7
; ARM-NEXT: mov r1, r6
; ARM-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; ARM-NEXT: mov r2, r7
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: adds r0, r0, r5
; ARM-NEXT: adcs r1, r6
; ARM-NEXT: adds r4, r4, r1
; ARM-NEXT: mov r5, r6
; ARM-NEXT: adcs r5, r6
; ARM-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; ARM-NEXT: mov r1, r6
; ARM-NEXT: mov r2, r7
; ARM-NEXT: mov r3, r6
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: adds r0, r0, r4
; ARM-NEXT: adcs r1, r5
; ARM-NEXT: add sp, #12
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
%tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 64)
ret i64 %tmp
}