
This commit is the result of investigation and discussion on WebAssembly/wide-arithmetic#6, where alternatives to the `i64.add128` instruction were discussed but ultimately deferred to a future proposal. In the meantime, I wanted to apply a few minor changes to the LLVM backend when `wide-arithmetic` is enabled:

* A lowering for the `ISD::UADDO` node is added which uses `add128` where the upper bits of the two operands are constant zeros, so the low result of the 128-bit addition is the result of the overflowing addition and the high result is the carry.
* The high bits of an `I64_ADD128` node are now flagged as "known zero" if the upper bits of its inputs are also zero, assisting the `UADDO` lowering by letting the backend prove that the carry result is a 1-bit value.

A few tests were then added to showcase the lowerings of various operations that can be done with wide-arithmetic. They don't all optimize especially well at this time, but I wanted to add them as a reference regardless, to have them on hand for future evaluations if necessary.
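For reference, here is a minimal C++ sketch of the shape of these two changes in the WebAssembly backend. This is an illustration, not the verbatim diff: the `LowerUADDO` helper name and the `I64_ADD128` operand/result ordering (low and high halves of each operand in, low and high results out) are assumptions made for the example.

```cpp
// Hedged sketch only. Assumed (not taken verbatim from LLVM): the LowerUADDO
// helper name and I64_ADD128's (LoA, HiA, LoB, HiB) -> (Lo, Hi) ordering.
SDValue WebAssemblyTargetLowering::LowerUADDO(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);

  // Model the overflowing 64-bit add as a 128-bit add whose high halves are
  // constant zero: the low result is the sum, the high result is the carry.
  SDValue Add128 =
      DAG.getNode(WebAssemblyISD::I64_ADD128, DL,
                  DAG.getVTList(MVT::i64, MVT::i64), LHS, Zero, RHS, Zero);
  SDValue Sum = Add128.getValue(0);
  SDValue Carry = Add128.getValue(1);

  // UADDO's second result is the overflow flag; narrow the carry to its type.
  EVT OvfVT = Op.getNode()->getValueType(1);
  SDValue Ovf = DAG.getNode(ISD::TRUNCATE, DL, OvfVT, Carry);
  return DAG.getMergeValues({Sum, Ovf}, DL);
}

// The matching known-bits change: if both high-half inputs are known zero,
// the high result of I64_ADD128 is just the carry, so bits 63..1 of it are
// known zero and the carry is provably a 1-bit value.
void WebAssemblyTargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
    const SelectionDAG &DAG, unsigned Depth) const {
  if (Op.getOpcode() == WebAssemblyISD::I64_ADD128 && Op.getResNo() == 1) {
    KnownBits HiA = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
    KnownBits HiB = DAG.computeKnownBits(Op.getOperand(3), Depth + 1);
    if (HiA.isZero() && HiB.isZero())
      Known.Zero.setBitsFrom(1);
  }
}
```

The test file below exercises these lowerings end-to-end.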
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mattr=+wide-arithmetic < %s | FileCheck %s

target triple = "wasm32-unknown-unknown"

define i128 @add_i128(i128 %a, i128 %b) {
; CHECK-LABEL: add_i128:
; CHECK: .functype add_i128 (i32, i64, i64, i64, i64) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i64.add128
; CHECK-NEXT: local.set 3
; CHECK-NEXT: local.set 4
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i64.store 8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
  %c = add i128 %a, %b
  ret i128 %c
}

define i128 @sub_i128(i128 %a, i128 %b) {
; CHECK-LABEL: sub_i128:
; CHECK: .functype sub_i128 (i32, i64, i64, i64, i64) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i64.sub128
; CHECK-NEXT: local.set 3
; CHECK-NEXT: local.set 4
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i64.store 8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
  %c = sub i128 %a, %b
  ret i128 %c
}

define i128 @mul_i128(i128 %a, i128 %b) {
; CHECK-LABEL: mul_i128:
; CHECK: .functype mul_i128 (i32, i64, i64, i64, i64) -> ()
; CHECK-NEXT: .local i64
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i64.mul_wide_u
; CHECK-NEXT: local.set 5
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 5
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 4
; CHECK-NEXT: i64.mul
; CHECK-NEXT: i64.add
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i64.mul
; CHECK-NEXT: i64.add
; CHECK-NEXT: i64.store 8
; CHECK-NEXT: # fallthrough-return
  %c = mul i128 %a, %b
  ret i128 %c
}

define i128 @i64_mul_wide_s(i64 %a, i64 %b) {
; CHECK-LABEL: i64_mul_wide_s:
; CHECK: .functype i64_mul_wide_s (i32, i64, i64) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.mul_wide_s
; CHECK-NEXT: local.set 1
; CHECK-NEXT: local.set 2
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.store 8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
  %a128 = sext i64 %a to i128
  %b128 = sext i64 %b to i128
  %c = mul i128 %a128, %b128
  ret i128 %c
}

define i128 @i64_mul_wide_u(i64 %a, i64 %b) {
; CHECK-LABEL: i64_mul_wide_u:
; CHECK: .functype i64_mul_wide_u (i32, i64, i64) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.mul_wide_u
; CHECK-NEXT: local.set 1
; CHECK-NEXT: local.set 2
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.store 8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
  %a128 = zext i64 %a to i128
  %b128 = zext i64 %b to i128
  %c = mul i128 %a128, %b128
  ret i128 %c
}

define i64 @mul_i128_only_lo(i128 %a, i128 %b) {
; CHECK-LABEL: mul_i128_only_lo:
; CHECK: .functype mul_i128_only_lo (i64, i64, i64, i64) -> (i64)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.mul
; CHECK-NEXT: # fallthrough-return
  %c = mul i128 %a, %b
  %d = trunc i128 %c to i64
  ret i64 %d
}

declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

; This is a codegen test to see the effect of overflowing adds on signed
; integers with wide-arithmetic enabled. At this time it doesn't generate
; anything different from what's produced without wide-arithmetic, but it
; has also been useful for evaluating the proposal.
define { i64, i1 } @add_wide_s(i64 %a, i64 %b) {
; CHECK-LABEL: add_wide_s:
; CHECK: .functype add_wide_s (i32, i64, i64) -> ()
; CHECK-NEXT: .local i64
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.add
; CHECK-NEXT: local.tee 3
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.lt_s
; CHECK-NEXT: local.get 3
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.lt_s
; CHECK-NEXT: i32.xor
; CHECK-NEXT: i32.store8 8
; CHECK-NEXT: # fallthrough-return
  %pair = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
  ret { i64, i1 } %pair
}

define { i64, i1 } @add_wide_u(i64 %a, i64 %b) {
; CHECK-LABEL: add_wide_u:
; CHECK: .functype add_wide_u (i32, i64, i64) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.add128
; CHECK-NEXT: local.set 1
; CHECK-NEXT: local.set 2
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.store8 8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  ret { i64, i1 } %pair
}

; This is a model of a hypothetical `i64.add_wide3_u` instruction using LLVM
; intrinsics. In theory this should optimize better (to the equivalent below),
; but it doesn't currently.
define { i64, i64 } @add_wide3_u_via_intrinsics(i64 %a, i64 %b, i64 %c) {
; CHECK-LABEL: add_wide3_u_via_intrinsics:
; CHECK: .functype add_wide3_u_via_intrinsics (i32, i64, i64, i64) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.add128
; CHECK-NEXT: local.set 2
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.add128
; CHECK-NEXT: local.set 1
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.add
; CHECK-NEXT: i64.store 8
; CHECK-NEXT: # fallthrough-return
  %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %t0 = extractvalue { i64, i1 } %pair, 0
  %carry1 = extractvalue { i64, i1 } %pair, 1

  %pair2 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %t0, i64 %c)
  %ret1 = extractvalue { i64, i1 } %pair2, 0
  %carry2 = extractvalue { i64, i1 } %pair2, 1

  %carry1_64 = zext i1 %carry1 to i64
  %carry2_64 = zext i1 %carry2 to i64
  %ret2 = add i64 %carry1_64, %carry2_64

  %r0 = insertvalue { i64, i64 } poison, i64 %ret1, 0
  %r1 = insertvalue { i64, i64 } %r0, i64 %ret2, 1
  ret { i64, i64 } %r1
}

; This is a model of a hypothetical `i64.add_wide3_u` instruction using
; 128-bit integer addition. This currently optimizes better than the
; intrinsics-based version above.
define { i64, i64 } @add_wide3_u_via_i128(i64 %a, i64 %b, i64 %c) {
; CHECK-LABEL: add_wide3_u_via_i128:
; CHECK: .functype add_wide3_u_via_i128 (i32, i64, i64, i64) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.add128
; CHECK-NEXT: local.get 3
; CHECK-NEXT: i64.const 0
; CHECK-NEXT: i64.add128
; CHECK-NEXT: local.set 1
; CHECK-NEXT: local.set 2
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i64.store 8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 2
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
  %a128 = zext i64 %a to i128
  %b128 = zext i64 %b to i128
  %c128 = zext i64 %c to i128
  %t0 = add i128 %a128, %b128
  %t1 = add i128 %t0, %c128
  %result = trunc i128 %t1 to i64
  %t2 = lshr i128 %t1, 64
  %carry = trunc i128 %t2 to i64

  %ret0 = insertvalue { i64, i64 } poison, i64 %result, 0
  %ret1 = insertvalue { i64, i64 } %ret0, i64 %carry, 1
  ret { i64, i64 } %ret1
}