David Green 3313cf4a83
[AArch64][GlobalISel] Add push_mul_through_s/zext (#141551)
This extends the existing push_add_through_zext to handle mul, similar
to performVectorExtCombine in SDAG. This allows muls to be pushed up the
tree of extends, operating on smaller vector types whilst keeping the
result the same (provided there are more than 2x the bits in the output).

matchExtAddvToUdotAddv needs to be adjusted to make sure it keeps
generating dot instructions from add(ext(mul(ext, ext))).
2025-07-31 07:38:11 +01:00

4804 lines
170 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-DOT
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT
; Plain add reduction of <2 x i32>. Every run line expects a pairwise addp
; followed by an fmov of the result into w0.
define i32 @addv_v2i32(<2 x i32> %a) {
; CHECK-LABEL: addv_v2i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%arg1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
ret i32 %arg1
}
; Plain add reduction of <4 x i16>. Every run line expects a single addv
; into h0.
define i16 @addv_v4i16(<4 x i16> %a) {
; CHECK-LABEL: addv_v4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv h0, v0.4h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
ret i16 %arg1
}
; Plain add reduction of <4 x i32>. Every run line expects a single addv
; into s0.
define i32 @add_v4i32_v4i32(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
ret i32 %z
}
; Plain add reduction of <8 x i8>. Every run line expects a single addv
; into b0.
define i8 @addv_v8i8(<8 x i8> %a) {
; CHECK-LABEL: addv_v8i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv b0, v0.8b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%arg1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
ret i8 %arg1
}
; Add reduction of <4 x i32> zero-extended to <4 x i64>. The extend and the
; reduction are expected to fold into one uaddlv.
define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv d0, v0.4s
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Add reduction of <4 x i32> sign-extended to <4 x i64>. The extend and the
; reduction are expected to fold into one saddlv.
define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv d0, v0.4s
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Mixed-extension case: the low two lanes of %xi are zero-extended, the high
; two lanes are sign-extended, the two <2 x i64> values are added and the sum
; is reduced. Expected code: ushll for the low half, saddw2 to add the
; sign-extended high half, then addp.
define i64 @add_v4i32_v4i64_zsext(<4 x i32> %xi) {
; CHECK-LABEL: add_v4i32_v4i64_zsext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%x = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 0, i32 1>
%y = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 2, i32 3>
%xx = zext <2 x i32> %x to <2 x i64>
%yy = sext <2 x i32> %y to <2 x i64>
%zz = add <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %zz)
ret i64 %z
}
; Add reduction of <2 x i32> zero-extended to <2 x i64>. The expected code
; widens explicitly with ushll and then reduces with addp.
define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Add reduction of <2 x i32> sign-extended to <2 x i64>. The expected code
; widens explicitly with sshll and then reduces with addp.
define i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Add reduction of <8 x i16> zero-extended to <8 x i32>. Expected to fold
; into one uaddlv over the 8h source lanes.
define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv s0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Add reduction of <8 x i16> sign-extended to <8 x i32>. Expected to fold
; into one saddlv over the 8h source lanes.
define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv s0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Add reduction of <4 x i16> zero-extended to <4 x i32>. Expected to fold
; into one uaddlv over the 4h source lanes.
define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv s0, v0.4h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Add reduction of <4 x i16> sign-extended to <4 x i32>. Expected to fold
; into one saddlv over the 4h source lanes.
define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv s0, v0.4h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Plain add reduction of <8 x i16>, returned as zeroext i16. Both selectors
; reduce with addv; the GlobalISel expected output additionally applies uxth
; to the scalar before returning.
define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv h0, v0.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: uxth w0, w8
; CHECK-GI-NEXT: ret
entry:
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
ret i16 %z
}
; Add reduction of <8 x i16> zero-extended to <8 x i64>. Without -global-isel
; the expected code widens in steps (ushll/ushll2, uaddl/uaddl2) and finishes
; with addp; with -global-isel it is a single uaddlv on the 8h lanes followed
; by a 32-bit lane move into w0.
define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.8h
; CHECK-GI-NEXT: mov w0, v0.s[0]
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Add reduction of <8 x i16> sign-extended to <8 x i64>. Without -global-isel
; the expected code widens in steps (sshll/sshll2, saddl/saddl2) and finishes
; with addp; with -global-isel it is a single saddlv on the 8h lanes followed
; by a sign-extending smov into x0.
define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.8h
; CHECK-GI-NEXT: smov x0, v0.s[0]
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Add reduction of <4 x i16> zero-extended to <4 x i64>. Without -global-isel
; the expected code is ushll to 4s followed by uaddlv into d0; with
; -global-isel it is uaddlv directly on the 4h lanes plus a 32-bit lane move.
define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: mov w0, v0.s[0]
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Add reduction of <4 x i16> sign-extended to <4 x i64>. Without -global-isel
; the expected code is sshll to 4s followed by saddlv into d0; with
; -global-isel it is saddlv directly on the 4h lanes plus a sign-extending
; smov into x0.
define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: smov x0, v0.s[0]
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Add reduction of <2 x i16> zero-extended to <2 x i64>. The two selectors
; order the mask and the widen differently: without -global-isel the lanes are
; masked to 16 bits first and then widened with ushll; with -global-isel they
; are widened first and then masked in the 64-bit lanes. Both end with addp.
define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK-SD-LABEL: add_v2i16_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v2i16_v2i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: fmov x0, d0
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Add reduction of <2 x i16> sign-extended to <2 x i64>. Every run line
; expects the same code: widen with ushll, sign-extend in place with a
; shl/sshr pair by 48, then addp.
define i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #48
; CHECK-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Add reduction of <16 x i8> zero-extended to <16 x i32>. Each of the four run
; configurations has its own expected output:
;  - default selector, no dotprod: stepwise widening (ushll/uaddl) then addv;
;  - default selector, +dotprod: udot against an all-ones byte vector with a
;    zeroed accumulator, then addv;
;  - GlobalISel, no dotprod: uaddlv into h0, then the scalar is masked to
;    16 bits;
;  - GlobalISel, +dotprod: the same udot-based sequence.
define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.16b, #1
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.16b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
ret i32 %z
}
; Add reduction of <16 x i8> sign-extended to <16 x i32>. Each of the four run
; configurations has its own expected output:
;  - default selector, no dotprod: stepwise widening (sshll/saddl) then addv;
;  - default selector, +dotprod: sdot against an all-ones byte vector with a
;    zeroed accumulator, then addv;
;  - GlobalISel, no dotprod: saddlv into h0, then sxth on the scalar;
;  - GlobalISel, +dotprod: the same sdot-based sequence.
define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.16b, #1
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.16b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
ret i32 %z
}
; Add reduction of <8 x i8> zero-extended to <8 x i32> (64-bit source vector).
; Expected output per configuration:
;  - default selector, no dotprod: ushll to 8h, then uaddlv;
;  - +dotprod (either selector): 2s-lane udot against an all-ones byte vector
;    with a zeroed accumulator, then a pairwise addp;
;  - GlobalISel, no dotprod: uaddlv into h0, then the scalar is masked to
;    16 bits.
define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: udot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Add reduction of <8 x i8> sign-extended to <8 x i32> (64-bit source vector).
; Expected output per configuration:
;  - default selector, no dotprod: sshll to 8h, then saddlv;
;  - +dotprod (either selector): 2s-lane sdot against an all-ones byte vector
;    with a zeroed accumulator, then a pairwise addp;
;  - GlobalISel, no dotprod: saddlv into h0, then sxth on the scalar.
define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: sdot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Add reduction of <4 x i8> zero-extended to <4 x i32>. In both expected
; outputs the argument is handled as 4h lanes whose upper byte is cleared
; before uaddlv: the default selector uses bic, GlobalISel uses a movi/and
; mask. The GlobalISel output also masks the scalar result to 16 bits.
define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i32_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: uaddlv s0, v0.4h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Add reduction of <4 x i8> sign-extended to <4 x i32>. The default selector's
; expected code widens to 4s, sign-extends in place with shl/sshr by 24, and
; reduces with addv. The GlobalISel expected code sign-extends within the 4h
; lanes (shl/sshr by 8), reduces with saddlv, and applies sxth to the scalar.
define i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i32_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: shl v0.4s, v0.4s, #24
; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth w0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Add reduction of <16 x i8> zero-extended to <16 x i16>, returned as zeroext
; i16. The default selector's expected code is a pairwise widening uaddlp
; followed by addv; the GlobalISel expected code is uaddlv on the byte lanes
; followed by a 16-bit mask of the scalar.
define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlp v0.8h, v0.16b
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
ret i16 %z
}
; Add reduction of <16 x i8> sign-extended to <16 x i16>, returned as signext
; i16. The default selector's expected code is saddlp + addv with a
; sign-extending smov of lane h[0]; the GlobalISel expected code is saddlv on
; the byte lanes followed by sxth on the scalar.
define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddlp v0.8h, v0.16b
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: smov w0, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth w0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
ret i16 %z
}
; Add reduction of <8 x i8> zero-extended to <8 x i16>, returned as zeroext
; i16. The default selector's expected code widens with ushll and reduces with
; addv; the GlobalISel expected code is uaddlv on the byte lanes followed by a
; 16-bit mask of the scalar.
define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
ret i16 %z
}
; Add reduction of <8 x i8> sign-extended to <8 x i16>, returned as signext
; i16. Both selectors reduce with saddlv into h0; they differ only in how the
; scalar is sign-extended (smov of lane h[0] versus fmov followed by sxth).
define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddlv h0, v0.8b
; CHECK-SD-NEXT: smov w0, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth w0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
ret i16 %z
}
; Plain add reduction of <16 x i8>, returned as zeroext i8. Both selectors
; reduce with addv into b0; the GlobalISel expected output additionally
; applies uxtb to the scalar before returning.
define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: addv b0, v0.16b
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv b0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: uxtb w0, w8
; CHECK-GI-NEXT: ret
entry:
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
ret i8 %z
}
; Add reduction of <16 x i8> zero-extended to <16 x i64>. The default
; selector's expected code widens i8 -> i16 -> i32 with ushll/ushll2, combines
; with uaddl/uaddl2 and vector adds, and finishes with addp. The GlobalISel
; expected code is a single uaddlv on the byte lanes followed by a 16-bit mask
; into x0.
define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and x0, x8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
ret i64 %z
}
; Add reduction of <16 x i8> sign-extended to <16 x i64>. The default
; selector's expected code widens i8 -> i16 -> i32 with sshll/sshll2, combines
; with saddl/saddl2 and vector adds, and finishes with addp. The GlobalISel
; expected code is a single saddlv on the byte lanes followed by sxth into x0.
define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth x0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
ret i64 %z
}
; Add reduction of <8 x i8> zero-extended to <8 x i64>. The default selector's
; expected code widens in steps (ushll, ushll2, uaddl/uaddl2) and finishes
; with addp. The GlobalISel expected code is a single uaddlv on the byte lanes
; followed by a 16-bit mask into x0.
define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and x0, x8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Add reduction of <8 x i8> sign-extended to <8 x i64>. The default selector's
; expected code widens in steps (sshll, sshll2, saddl/saddl2) and finishes
; with addp. The GlobalISel expected code is a single saddlv on the byte lanes
; followed by sxth into x0.
define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth x0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Add reduction of <4 x i8> zero-extended to <4 x i64>. The default selector's
; expected code clears the upper byte of each 4h lane with bic, widens to 4s
; and reduces with uaddlv into d0. The GlobalISel expected code masks with
; movi/and, reduces with uaddlv on the 4h lanes, and masks the scalar to
; 16 bits into x0.
define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: and x0, x8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Add reduction of <4 x i8> sign-extended to <4 x i64>. The default selector's
; expected code widens both halves to 2d, sign-extends with shl/sshr by 56
; (the second half's shift is folded into an accumulating ssra), and finishes
; with addp. The GlobalISel expected code sign-extends within the 4h lanes
; (shl/sshr by 8), reduces with saddlv, and applies sxth into x0.
define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: sxth x0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Add reduction of <2 x i8> zero-extended to <2 x i64>. The two selectors
; order the mask and the widen differently: without -global-isel the lanes are
; masked to 8 bits first and then widened with ushll; with -global-isel they
; are widened first and then masked in the 64-bit lanes. Both end with addp.
define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK-SD-LABEL: add_v2i8_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v2i8_v2i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: fmov x0, d0
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Add reduction of <2 x i8> sign-extended to <2 x i64>. Every run line
; expects the same code: widen with ushll, sign-extend in place with a
; shl/sshr pair by 56, then addp.
define i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #56
; CHECK-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Plain add reduction of <2 x i64>. Every run line expects a single scalar
; pairwise addp into d0.
define i64 @add_v2i64_v2i64(<2 x i64> %x) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
ret i64 %z
}
; Accumulating variant: add reduction of <4 x i32> plus the scalar %a.
; Expected as addv, a move to a general register, and a scalar add with w0.
define i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating variant: add reduction of <4 x i32> zero-extended to i64 lanes,
; plus the scalar %a. Expected as uaddlv followed by a scalar add with x0.
define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv d0, v0.4s
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: add reduction of <4 x i32> sign-extended to i64 lanes,
; plus the scalar %a. Expected as saddlv followed by a scalar add with x0.
define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv d0, v0.4s
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: add reduction of <2 x i32> zero-extended to
; <2 x i64>, plus the scalar %a. Expected as ushll + addp followed by a scalar
; add with x0.
define i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: add reduction of <2 x i32> sign-extended to
; <2 x i64>, plus the scalar %a. Expected as sshll + addp followed by a scalar
; add with x0.
define i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: add reduction of <8 x i16> zero-extended to i32 lanes,
; plus the scalar %a. Expected as uaddlv followed by a scalar add with w0.
define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv s0, v0.8h
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating variant: add reduction of <8 x i16> sign-extended to i32 lanes,
; plus the scalar %a. Expected as saddlv followed by a scalar add with w0.
define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv s0, v0.8h
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating variant: add reduction of <4 x i16> zero-extended to i32 lanes,
; plus the scalar %a. Expected as uaddlv followed by a scalar add with w0.
define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv s0, v0.4h
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating variant: add reduction of <4 x i16> sign-extended to i32 lanes,
; plus the scalar %a. Expected as saddlv followed by a scalar add with w0.
define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv s0, v0.4h
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating variant: add reduction of <8 x i16> plus the i16 scalar %a,
; returned as zeroext i16. Both selectors use addv and mask the final sum to
; 16 bits; the GlobalISel expected add additionally applies a uxth extend to
; the reduced operand.
define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
; CHECK-SD-LABEL: add_v8i16_v8i16_acc:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w8, w8, w0
; CHECK-SD-NEXT: and w0, w8, #0xffff
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i16_acc:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv h0, v0.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w8, w0, w8, uxth
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
%r = add i16 %z, %a
ret i16 %r
}
; Accumulating variant: add reduction of <8 x i16> zero-extended to i64 lanes,
; plus the scalar %a. The default selector's expected code widens in steps
; (ushll/uaddl) before addp and a plain scalar add. The GlobalISel expected
; code is uaddlv on the 8h lanes with the zero-extension folded into the
; scalar add (uxtw operand).
define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, uxtw
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: add reduction of <8 x i16> sign-extended to i64 lanes,
; plus the scalar %a. The default selector's expected code widens in steps
; (sshll/saddl) before addp and a plain scalar add. The GlobalISel expected
; code is saddlv on the 8h lanes with the sign-extension folded into the
; scalar add (sxtw operand).
define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, sxtw
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: add reduction of <4 x i16> zero-extended to i64 lanes,
; plus the scalar %a. The default selector's expected code is ushll + uaddlv
; into d0 and a plain scalar add. The GlobalISel expected code is uaddlv on
; the 4h lanes with the zero-extension folded into the scalar add (uxtw
; operand).
define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, uxtw
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: add reduction of <4 x i16> sign-extended to i64 lanes,
; plus the scalar %a. The default selector's expected code is sshll + saddlv
; into d0 and a plain scalar add. The GlobalISel expected code is saddlv on
; the 4h lanes with the sign-extension folded into the scalar add (sxtw
; operand).
define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, sxtw
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends <2 x i16> to <2 x i64>, add-reduces, then adds the i64
; accumulator %a. Both selectors mask each lane down to 16 bits (movi + and)
; and reduce with addp; SelectionDAG masks before the ushll, GlobalISel after.
define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK-SD-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: add x0, x8, x0
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends <2 x i16> to <2 x i64>, add-reduces, then adds the i64
; accumulator %a. All run lines share one expected sequence: ushll, then a
; shl/sshr pair by 48 to sign-extend in-register, addp, and a scalar add.
define i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #48
; CHECK-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends <16 x i8> to <16 x i32>, add-reduces, then adds the i32
; accumulator %a. Without +dotprod, SelectionDAG emits a uaddl widening tree
; and GlobalISel emits uaddlv h plus a uxth add. With +dotprod, both selectors
; emit udot against an all-ones byte vector into a zeroed accumulator, then addv.
define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.16b, #1
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w0, w8, uxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.16b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends <16 x i8> to <16 x i32>, add-reduces, then adds the i32
; accumulator %a. Without +dotprod, SelectionDAG emits a saddl widening tree
; and GlobalISel emits saddlv h plus an sxth add. With +dotprod, both selectors
; emit sdot against an all-ones byte vector into a zeroed accumulator, then addv.
define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.16b, #1
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w0, w8, sxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.16b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends <8 x i8> to <8 x i32>, add-reduces, then adds the i32
; accumulator %a. Without +dotprod, SelectionDAG emits ushll + uaddlv s and
; GlobalISel emits uaddlv h plus a uxth add. With +dotprod, both selectors emit
; a 64-bit udot against an all-ones byte vector and finish with addp.
define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: udot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w0, w8, uxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends <8 x i8> to <8 x i32>, add-reduces, then adds the i32
; accumulator %a. Without +dotprod, SelectionDAG emits sshll + saddlv s and
; GlobalISel emits saddlv h plus an sxth add. With +dotprod, both selectors emit
; a 64-bit sdot against an all-ones byte vector and finish with addp.
define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: sdot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w0, w8, sxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends <4 x i8> to <4 x i32>, add-reduces, then adds the i32
; accumulator %a. Both selectors clear the high byte of each halfword lane
; before uaddlv: SelectionDAG with bic, GlobalISel with a movi/and mask (and
; GlobalISel folds the accumulate into a uxth add).
define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
; CHECK-SD-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: uaddlv s0, v0.4h
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w0, w8, w0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w0, w8, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends <4 x i8> to <4 x i32>, add-reduces, then adds the i32
; accumulator %a. SelectionDAG widens to 4s and sign-extends with shl/sshr #24
; before addv; GlobalISel sign-extends in the 4h lanes with shl/sshr #8, then
; uses saddlv and an sxth add.
define i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) {
; CHECK-SD-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: shl v0.4s, v0.4s, #24
; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w0, w8, w0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w0, w8, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends <16 x i8> to <16 x i16>, add-reduces, then adds the i16
; accumulator %a. All run lines share one sequence: uaddlv h, a 32-bit add,
; and an and with 0xffff to satisfy the zeroext return attribute.
define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv h0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w8, w8, w0
; CHECK-NEXT: and w0, w8, #0xffff
; CHECK-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Sign-extends <16 x i8> to <16 x i16>, add-reduces, then adds the i16
; accumulator %a. All run lines share one sequence: saddlv h, a 32-bit add,
; and sxth to satisfy the signext return attribute.
define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv h0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w8, w8, w0
; CHECK-NEXT: sxth w0, w8
; CHECK-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Zero-extends <8 x i8> to <8 x i16>, add-reduces, then adds the i16
; accumulator %a. All run lines share one sequence: uaddlv h on the 8b input,
; a 32-bit add, and an and with 0xffff for the zeroext return attribute.
define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uaddlv h0, v0.8b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w8, w8, w0
; CHECK-NEXT: and w0, w8, #0xffff
; CHECK-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Sign-extends <8 x i8> to <8 x i16>, add-reduces, then adds the i16
; accumulator %a. All run lines share one sequence: saddlv h on the 8b input,
; a 32-bit add, and sxth for the signext return attribute.
define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: saddlv h0, v0.8b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w8, w8, w0
; CHECK-NEXT: sxth w0, w8
; CHECK-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Add-reduces <16 x i8> with no extension, then adds the i8 accumulator %a.
; Both selectors use addv b and mask the result with 0xff for the zeroext
; return attribute; GlobalISel additionally uses a uxtb-extended add.
define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
; CHECK-SD-LABEL: add_v16i8_v16i8_acc:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: addv b0, v0.16b
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w8, w8, w0
; CHECK-SD-NEXT: and w0, w8, #0xff
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i8_acc:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv b0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w8, w0, w8, uxtb
; CHECK-GI-NEXT: and w0, w8, #0xff
; CHECK-GI-NEXT: ret
entry:
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
%r = add i8 %z, %a
ret i8 %r
}
; Zero-extends <16 x i8> to <16 x i64>, add-reduces, then adds the i64
; accumulator %a. SelectionDAG widens through ushll/uaddl stages down to a
; final addp; GlobalISel emits uaddlv h plus a uxth-extended 64-bit add.
define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends <16 x i8> to <16 x i64>, add-reduces, then adds the i64
; accumulator %a. SelectionDAG widens through sshll/saddl stages down to a
; final addp; GlobalISel emits saddlv h plus an sxth-extended 64-bit add.
define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends <8 x i8> to <8 x i64>, add-reduces, then adds the i64
; accumulator %a. SelectionDAG widens through ushll/uaddl stages and addp;
; GlobalISel emits uaddlv h on the 8b input plus a uxth-extended 64-bit add.
define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends <8 x i8> to <8 x i64>, add-reduces, then adds the i64
; accumulator %a. SelectionDAG widens through sshll/saddl stages and addp;
; GlobalISel emits saddlv h on the 8b input plus an sxth-extended 64-bit add.
define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends <4 x i8> to <4 x i64>, add-reduces, then adds the i64
; accumulator %a. SelectionDAG clears the high bytes with bic, widens with
; ushll, and reduces with uaddlv d; GlobalISel masks with movi/and, reduces
; with uaddlv s, and folds the accumulate into a uxth-extended add.
define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends <4 x i8> to <4 x i64>, add-reduces, then adds the i64
; accumulator %a. SelectionDAG widens to two 2d halves, sign-extends each with
; shifts by 56 (the second folded into ssra), and finishes with addp;
; GlobalISel sign-extends in 4h lanes, then uses saddlv and an sxth add.
define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add x0, x0, w8, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends <2 x i8> to <2 x i64>, add-reduces, then adds the i64
; accumulator %a. Both selectors mask each lane down to 8 bits (movi + and)
; and reduce with addp; SelectionDAG masks before the ushll, GlobalISel after.
define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
; CHECK-SD-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: add x0, x8, x0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: add x0, x8, x0
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends <2 x i8> to <2 x i64>, add-reduces, then adds the i64
; accumulator %a. All run lines share one sequence: ushll, a shl/sshr pair by
; 56 to sign-extend in-register, addp, and a scalar add.
define i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #56
; CHECK-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Add-reduces <2 x i64> with no extension, then adds the i64 accumulator %a.
; All run lines share one sequence: addp, fmov to a GPR, and a scalar add.
define i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
%r = add i64 %z, %a
ret i64 %r
}
; Sums two independent add-reductions of <4 x i32> vectors %x and %y.
; SelectionDAG adds the vectors first and performs a single addv; GlobalISel
; reduces each vector with its own addv and adds the two scalars.
define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-LABEL: add_pair_v4i32_v4i32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i32_v4i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sums two add-reductions, each over a <4 x i32> input zero-extended to
; <4 x i64>. SelectionDAG combines them with uaddlp + uadalp and one addp;
; GlobalISel reduces each input with uaddlv and adds the two scalars.
define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv d0, v0.4s
; CHECK-GI-NEXT: uaddlv d1, v1.4s
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = zext <4 x i32> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sums two add-reductions, each over a <4 x i32> input sign-extended to
; <4 x i64>. SelectionDAG combines them with saddlp + sadalp and one addp;
; GlobalISel reduces each input with saddlv and adds the two scalars.
define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv d0, v0.4s
; CHECK-GI-NEXT: saddlv d1, v1.4s
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = sext <4 x i32> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sums two add-reductions, each over a <2 x i32> input zero-extended to
; <2 x i64>. SelectionDAG packs both 64-bit inputs into one q register
; (mov v0.d[1], v1.d[0]) and reduces with a single uaddlv; GlobalISel widens
; each input with ushll, reduces each with addp, and adds the scalars.
define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-SD-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = zext <2 x i32> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sums two add-reductions, each over a <2 x i32> input sign-extended to
; <2 x i64>. SelectionDAG combines the inputs with one saddl and one addp;
; GlobalISel widens each with sshll, reduces each with addp, and adds the
; two scalars.
define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-SD-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = sext <2 x i32> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sums two add-reductions, each over an <8 x i16> input zero-extended to
; <8 x i32>. SelectionDAG combines them with uaddlp + uadalp and one addv;
; GlobalISel reduces each input with uaddlv and adds the two scalars.
define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-NEXT: addv s0, v1.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.8h
; CHECK-GI-NEXT: uaddlv s1, v1.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = zext <8 x i16> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sums two add-reductions, each over an <8 x i16> input sign-extended to
; <8 x i32>. SelectionDAG combines them with saddlp + sadalp and one addv;
; GlobalISel reduces each input with saddlv and adds the two scalars.
define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddlp v1.4s, v1.8h
; CHECK-SD-NEXT: sadalp v1.4s, v0.8h
; CHECK-SD-NEXT: addv s0, v1.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.8h
; CHECK-GI-NEXT: saddlv s1, v1.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = sext <8 x i16> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sums two add-reductions, each over a <4 x i16> input zero-extended to
; <4 x i32>. SelectionDAG packs both 64-bit inputs into one q register
; (mov v0.d[1], v1.d[0]) and reduces with a single uaddlv over 8h; GlobalISel
; reduces each input with its own uaddlv and adds the two scalars.
define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: uaddlv s0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: uaddlv s1, v1.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = zext <4 x i16> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sums two add-reductions, each over a <4 x i16> input sign-extended to
; <4 x i32>. SelectionDAG combines the inputs with one saddl and one addv;
; GlobalISel reduces each input with saddlv and adds the two scalars.
define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: saddlv s1, v1.4h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = sext <4 x i16> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Unsigned dot-product pattern: zero-extends both <8 x i8> inputs to i32,
; multiplies them lane-wise, and add-reduces the product. Without +dotprod,
; both selectors emit umull + uaddlv; with +dotprod, both emit a 64-bit udot
; into a zeroed accumulator followed by addp.
define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_udot_v8i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: umull v0.8h, v1.8b, v0.8b
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v8i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_udot_v8i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: umull v0.8h, v1.8b, v0.8b
; CHECK-GI-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v8i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%0 = zext <8 x i8> %a to <8 x i32>
%1 = zext <8 x i8> %b to <8 x i32>
%2 = mul nuw nsw <8 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
ret i32 %3
}
; Unsigned dot-product pattern: zero-extends both <16 x i8> inputs to i32,
; multiplies them lane-wise, and add-reduces the product. Without +dotprod,
; SelectionDAG emits umull/umull2 with a uaddl tree and one addv, while
; GlobalISel reduces each umull half with uaddlv and adds the scalars. With
; +dotprod, both selectors emit a 128-bit udot followed by addv.
define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_udot_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: umull2 v2.8h, v1.16b, v0.16b
; CHECK-SD-BASE-NEXT: umull v0.8h, v1.8b, v0.8b
; CHECK-SD-BASE-NEXT: uaddl2 v1.4s, v0.8h, v2.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v16i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v2.4s, v1.16b, v0.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_udot_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: umull v2.8h, v1.8b, v0.8b
; CHECK-GI-BASE-NEXT: umull2 v0.8h, v1.16b, v0.16b
; CHECK-GI-BASE-NEXT: uaddlv s1, v2.8h
; CHECK-GI-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s1
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v16i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.4s, v1.16b, v0.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%0 = zext <16 x i8> %a to <16 x i32>
%1 = zext <16 x i8> %b to <16 x i32>
%2 = mul nuw nsw <16 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
ret i32 %3
}
; Unsigned dot-product pattern on a non-power-of-two width: loads two
; <24 x i8> vectors from %p1 and %p2, zero-extends both to i32, multiplies
; lane-wise, and add-reduces. Every variant loads each input as a 16-byte q
; register plus an 8-byte d register at offset 16. With +dotprod, SelectionDAG
; uses a 4s udot for the first 16 bytes and a 2s udot for the tail, summing the
; two scalar results; GlobalISel uses two 4s udots and a vector add before a
; single addv.
define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE-LABEL: test_udot_v24i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ldr q0, [x0]
; CHECK-SD-BASE-NEXT: ldr q1, [x1]
; CHECK-SD-BASE-NEXT: ldr d2, [x0, #16]
; CHECK-SD-BASE-NEXT: ldr d3, [x1, #16]
; CHECK-SD-BASE-NEXT: umull v2.8h, v3.8b, v2.8b
; CHECK-SD-BASE-NEXT: umull v3.8h, v1.8b, v0.8b
; CHECK-SD-BASE-NEXT: umull2 v0.8h, v1.16b, v0.16b
; CHECK-SD-BASE-NEXT: uaddl2 v1.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: uaddl v2.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: uaddw v0.4s, v2.4s, v0.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v24i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q2, [x0]
; CHECK-SD-DOT-NEXT: ldr q3, [x1]
; CHECK-SD-DOT-NEXT: ldr d4, [x0, #16]
; CHECK-SD-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-SD-DOT-NEXT: udot v1.2s, v5.8b, v4.8b
; CHECK-SD-DOT-NEXT: udot v0.4s, v3.16b, v2.16b
; CHECK-SD-DOT-NEXT: addp v1.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w8, s1
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: add w0, w9, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_udot_v24i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ldr q0, [x0]
; CHECK-GI-BASE-NEXT: ldr q1, [x1]
; CHECK-GI-BASE-NEXT: ldr d2, [x0, #16]
; CHECK-GI-BASE-NEXT: ldr d3, [x1, #16]
; CHECK-GI-BASE-NEXT: umull v4.8h, v1.8b, v0.8b
; CHECK-GI-BASE-NEXT: umull2 v0.8h, v1.16b, v0.16b
; CHECK-GI-BASE-NEXT: umull v1.8h, v3.8b, v2.8b
; CHECK-GI-BASE-NEXT: uaddlv s2, v4.8h
; CHECK-GI-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-GI-BASE-NEXT: uaddlv s1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s2
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v24i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q2, [x0]
; CHECK-GI-DOT-NEXT: ldr d3, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr q4, [x1]
; CHECK-GI-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-GI-DOT-NEXT: udot v1.4s, v4.16b, v2.16b
; CHECK-GI-DOT-NEXT: udot v0.4s, v5.16b, v3.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <24 x i8>, ptr %p1
%b = load <24 x i8>, ptr %p2
%0 = zext <24 x i8> %a to <24 x i32>
%1 = zext <24 x i8> %b to <24 x i32>
%2 = mul nuw nsw <24 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
ret i32 %3
}
; Unsigned dot-product reduction over 48 bytes: loads two <48 x i8> vectors,
; zero-extends both to i32, multiplies them and add-reduces to a single i32.
; With +dotprod both selectors are expected to emit three udot instructions;
; without it the product is formed with umull/umull2 and summed with widening
; adds (SelectionDAG) or per-vector uaddlv plus scalar adds (GlobalISel).
; NOTE(review): the GlobalISel +dotprod output builds v0 from wzr one lane at
; a time and adds it to a udot result, which looks like a redundant add of
; zero - confirm whether this is a known missed optimisation.
define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE-LABEL: test_udot_v48i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ldp q4, q0, [x0, #16]
; CHECK-SD-BASE-NEXT: ldr q2, [x1, #32]
; CHECK-SD-BASE-NEXT: ldp q1, q5, [x1]
; CHECK-SD-BASE-NEXT: ldr q3, [x0]
; CHECK-SD-BASE-NEXT: umull2 v6.8h, v2.16b, v0.16b
; CHECK-SD-BASE-NEXT: umull v0.8h, v2.8b, v0.8b
; CHECK-SD-BASE-NEXT: umull2 v7.8h, v1.16b, v3.16b
; CHECK-SD-BASE-NEXT: umull v1.8h, v1.8b, v3.8b
; CHECK-SD-BASE-NEXT: umull2 v2.8h, v5.16b, v4.16b
; CHECK-SD-BASE-NEXT: umull v3.8h, v5.8b, v4.8b
; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v7.8h, v6.8h
; CHECK-SD-BASE-NEXT: uaddl2 v5.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: uaddl v6.4s, v7.4h, v6.4h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v4.4s, v2.8h
; CHECK-SD-BASE-NEXT: uaddw2 v4.4s, v5.4s, v3.8h
; CHECK-SD-BASE-NEXT: uaddw v2.4s, v6.4s, v2.4h
; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v3.4h
; CHECK-SD-BASE-NEXT: add v1.4s, v4.4s, v1.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v48i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q1, [x0, #32]
; CHECK-SD-DOT-NEXT: ldr q2, [x1, #32]
; CHECK-SD-DOT-NEXT: udot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: ldp q3, q1, [x0]
; CHECK-SD-DOT-NEXT: ldp q4, q2, [x1]
; CHECK-SD-DOT-NEXT: udot v0.4s, v4.16b, v3.16b
; CHECK-SD-DOT-NEXT: udot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_udot_v48i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ldp q0, q1, [x0]
; CHECK-GI-BASE-NEXT: ldr q3, [x0, #32]
; CHECK-GI-BASE-NEXT: ldp q2, q4, [x1]
; CHECK-GI-BASE-NEXT: ldr q5, [x1, #32]
; CHECK-GI-BASE-NEXT: umull v7.8h, v5.8b, v3.8b
; CHECK-GI-BASE-NEXT: umull2 v3.8h, v5.16b, v3.16b
; CHECK-GI-BASE-NEXT: umull v6.8h, v2.8b, v0.8b
; CHECK-GI-BASE-NEXT: umull2 v0.8h, v2.16b, v0.16b
; CHECK-GI-BASE-NEXT: umull2 v2.8h, v4.16b, v1.16b
; CHECK-GI-BASE-NEXT: umull v1.8h, v4.8b, v1.8b
; CHECK-GI-BASE-NEXT: uaddlv s5, v7.8h
; CHECK-GI-BASE-NEXT: uaddlv s3, v3.8h
; CHECK-GI-BASE-NEXT: uaddlv s4, v6.8h
; CHECK-GI-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-GI-BASE-NEXT: uaddlv s2, v2.8h
; CHECK-GI-BASE-NEXT: uaddlv s1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w11, s5
; CHECK-GI-BASE-NEXT: fmov w8, s4
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: fmov w10, s2
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w10, w10, w11
; CHECK-GI-BASE-NEXT: fmov w11, s3
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: add w9, w10, w11
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: fmov s0, wzr
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32]
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q17, [x1, #32]
; CHECK-GI-DOT-NEXT: ldp q4, q5, [x0]
; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr
; CHECK-GI-DOT-NEXT: ldp q6, q16, [x1]
; CHECK-GI-DOT-NEXT: udot v2.4s, v17.16b, v7.16b
; CHECK-GI-DOT-NEXT: udot v1.4s, v6.16b, v4.16b
; CHECK-GI-DOT-NEXT: udot v3.4s, v16.16b, v5.16b
; CHECK-GI-DOT-NEXT: mov v0.s[2], wzr
; CHECK-GI-DOT-NEXT: add v1.4s, v1.4s, v3.4s
; CHECK-GI-DOT-NEXT: mov v0.s[3], wzr
; CHECK-GI-DOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <48 x i8>, ptr %p1
%b = load <48 x i8>, ptr %p2
%0 = zext <48 x i8> %a to <48 x i32>
%1 = zext <48 x i8> %b to <48 x i32>
%2 = mul nuw nsw <48 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction of two <8 x i8> arguments: sign-extend to i32,
; multiply and add-reduce. With +dotprod both selectors are expected to emit
; one 64-bit sdot (.2s/.8b) followed by addp; without it, smull followed by
; saddlv.
define i32 @test_sdot_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_sdot_v8i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: smull v0.8h, v1.8b, v0.8b
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v8i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v2.2s, v1.8b, v0.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_sdot_v8i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: smull v0.8h, v1.8b, v0.8b
; CHECK-GI-BASE-NEXT: saddlv s0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v8i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.2s, v1.8b, v0.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%0 = sext <8 x i8> %a to <8 x i32>
%1 = sext <8 x i8> %b to <8 x i32>
%2 = mul nuw nsw <8 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction of two <16 x i8> arguments: sign-extend to i32,
; multiply and add-reduce. With +dotprod both selectors are expected to emit a
; single 128-bit sdot followed by addv. Without it, SelectionDAG widens the
; smull/smull2 products with saddl/saddl2 before addv, while GlobalISel
; reduces each product with saddlv and adds the two scalars.
define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_sdot_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: smull2 v2.8h, v1.16b, v0.16b
; CHECK-SD-BASE-NEXT: smull v0.8h, v1.8b, v0.8b
; CHECK-SD-BASE-NEXT: saddl2 v1.4s, v0.8h, v2.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v2.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v16i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v2.4s, v1.16b, v0.16b
; CHECK-SD-DOT-NEXT: addv s0, v2.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_sdot_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: smull v2.8h, v1.8b, v0.8b
; CHECK-GI-BASE-NEXT: smull2 v0.8h, v1.16b, v0.16b
; CHECK-GI-BASE-NEXT: saddlv s1, v2.8h
; CHECK-GI-BASE-NEXT: saddlv s0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s1
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v16i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.4s, v1.16b, v0.16b
; CHECK-GI-DOT-NEXT: addv s0, v2.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%0 = sext <16 x i8> %a to <16 x i32>
%1 = sext <16 x i8> %b to <16 x i32>
%2 = mul nuw nsw <16 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over 24 bytes, a non-power-of-two length that
; is loaded as a 16-byte q register plus an 8-byte d register per operand.
; With +dotprod, SelectionDAG is expected to use a .4s sdot for the first 16
; bytes and a .2s sdot for the 8-byte tail, reducing each separately and
; adding the scalars; GlobalISel is expected to use two .4s sdot instructions
; (the tail registers are used as .16b operands), a vector add and one addv.
define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE-LABEL: test_sdot_v24i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ldr q0, [x0]
; CHECK-SD-BASE-NEXT: ldr q1, [x1]
; CHECK-SD-BASE-NEXT: ldr d2, [x0, #16]
; CHECK-SD-BASE-NEXT: ldr d3, [x1, #16]
; CHECK-SD-BASE-NEXT: smull v2.8h, v3.8b, v2.8b
; CHECK-SD-BASE-NEXT: smull v3.8h, v1.8b, v0.8b
; CHECK-SD-BASE-NEXT: smull2 v0.8h, v1.16b, v0.16b
; CHECK-SD-BASE-NEXT: saddl2 v1.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: saddl v2.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: saddw v0.4s, v2.4s, v0.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v24i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q2, [x0]
; CHECK-SD-DOT-NEXT: ldr q3, [x1]
; CHECK-SD-DOT-NEXT: ldr d4, [x0, #16]
; CHECK-SD-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-SD-DOT-NEXT: sdot v1.2s, v5.8b, v4.8b
; CHECK-SD-DOT-NEXT: sdot v0.4s, v3.16b, v2.16b
; CHECK-SD-DOT-NEXT: addp v1.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w8, s1
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: add w0, w9, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_sdot_v24i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ldr q0, [x0]
; CHECK-GI-BASE-NEXT: ldr q1, [x1]
; CHECK-GI-BASE-NEXT: ldr d2, [x0, #16]
; CHECK-GI-BASE-NEXT: ldr d3, [x1, #16]
; CHECK-GI-BASE-NEXT: smull v4.8h, v1.8b, v0.8b
; CHECK-GI-BASE-NEXT: smull2 v0.8h, v1.16b, v0.16b
; CHECK-GI-BASE-NEXT: smull v1.8h, v3.8b, v2.8b
; CHECK-GI-BASE-NEXT: saddlv s2, v4.8h
; CHECK-GI-BASE-NEXT: saddlv s0, v0.8h
; CHECK-GI-BASE-NEXT: saddlv s1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s2
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q2, [x0]
; CHECK-GI-DOT-NEXT: ldr d3, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr q4, [x1]
; CHECK-GI-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-GI-DOT-NEXT: sdot v1.4s, v4.16b, v2.16b
; CHECK-GI-DOT-NEXT: sdot v0.4s, v5.16b, v3.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <24 x i8>, ptr %p1
%b = load <24 x i8>, ptr %p2
%0 = sext <24 x i8> %a to <24 x i32>
%1 = sext <24 x i8> %b to <24 x i32>
%2 = mul nuw nsw <24 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over 48 bytes: loads two <48 x i8> vectors,
; sign-extends both to i32, multiplies them and add-reduces to a single i32.
; With +dotprod both selectors are expected to emit three sdot instructions;
; without it the product is formed with smull/smull2 and summed with widening
; adds (SelectionDAG) or per-vector saddlv plus scalar adds (GlobalISel).
; NOTE(review): the GlobalISel +dotprod output builds v0 from wzr one lane at
; a time and adds it to an sdot result, which looks like a redundant add of
; zero - confirm whether this is a known missed optimisation.
define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
; CHECK-SD-BASE-LABEL: test_sdot_v48i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ldp q4, q0, [x0, #16]
; CHECK-SD-BASE-NEXT: ldr q2, [x1, #32]
; CHECK-SD-BASE-NEXT: ldp q1, q5, [x1]
; CHECK-SD-BASE-NEXT: ldr q3, [x0]
; CHECK-SD-BASE-NEXT: smull2 v6.8h, v2.16b, v0.16b
; CHECK-SD-BASE-NEXT: smull v0.8h, v2.8b, v0.8b
; CHECK-SD-BASE-NEXT: smull2 v7.8h, v1.16b, v3.16b
; CHECK-SD-BASE-NEXT: smull v1.8h, v1.8b, v3.8b
; CHECK-SD-BASE-NEXT: smull2 v2.8h, v5.16b, v4.16b
; CHECK-SD-BASE-NEXT: smull v3.8h, v5.8b, v4.8b
; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v7.8h, v6.8h
; CHECK-SD-BASE-NEXT: saddl2 v5.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: saddl v6.4s, v7.4h, v6.4h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v4.4s, v2.8h
; CHECK-SD-BASE-NEXT: saddw2 v4.4s, v5.4s, v3.8h
; CHECK-SD-BASE-NEXT: saddw v2.4s, v6.4s, v2.4h
; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v3.4h
; CHECK-SD-BASE-NEXT: add v1.4s, v4.4s, v1.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v48i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q1, [x0, #32]
; CHECK-SD-DOT-NEXT: ldr q2, [x1, #32]
; CHECK-SD-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: ldp q3, q1, [x0]
; CHECK-SD-DOT-NEXT: ldp q4, q2, [x1]
; CHECK-SD-DOT-NEXT: sdot v0.4s, v4.16b, v3.16b
; CHECK-SD-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_sdot_v48i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ldp q0, q1, [x0]
; CHECK-GI-BASE-NEXT: ldr q3, [x0, #32]
; CHECK-GI-BASE-NEXT: ldp q2, q4, [x1]
; CHECK-GI-BASE-NEXT: ldr q5, [x1, #32]
; CHECK-GI-BASE-NEXT: smull v7.8h, v5.8b, v3.8b
; CHECK-GI-BASE-NEXT: smull2 v3.8h, v5.16b, v3.16b
; CHECK-GI-BASE-NEXT: smull v6.8h, v2.8b, v0.8b
; CHECK-GI-BASE-NEXT: smull2 v0.8h, v2.16b, v0.16b
; CHECK-GI-BASE-NEXT: smull2 v2.8h, v4.16b, v1.16b
; CHECK-GI-BASE-NEXT: smull v1.8h, v4.8b, v1.8b
; CHECK-GI-BASE-NEXT: saddlv s5, v7.8h
; CHECK-GI-BASE-NEXT: saddlv s3, v3.8h
; CHECK-GI-BASE-NEXT: saddlv s4, v6.8h
; CHECK-GI-BASE-NEXT: saddlv s0, v0.8h
; CHECK-GI-BASE-NEXT: saddlv s2, v2.8h
; CHECK-GI-BASE-NEXT: saddlv s1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w11, s5
; CHECK-GI-BASE-NEXT: fmov w8, s4
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: fmov w10, s2
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w10, w10, w11
; CHECK-GI-BASE-NEXT: fmov w11, s3
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: add w9, w10, w11
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: fmov s0, wzr
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32]
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q17, [x1, #32]
; CHECK-GI-DOT-NEXT: ldp q4, q5, [x0]
; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr
; CHECK-GI-DOT-NEXT: ldp q6, q16, [x1]
; CHECK-GI-DOT-NEXT: sdot v2.4s, v17.16b, v7.16b
; CHECK-GI-DOT-NEXT: sdot v1.4s, v6.16b, v4.16b
; CHECK-GI-DOT-NEXT: sdot v3.4s, v16.16b, v5.16b
; CHECK-GI-DOT-NEXT: mov v0.s[2], wzr
; CHECK-GI-DOT-NEXT: add v1.4s, v1.4s, v3.4s
; CHECK-GI-DOT-NEXT: mov v0.s[3], wzr
; CHECK-GI-DOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <48 x i8>, ptr %p1
%b = load <48 x i8>, ptr %p2
%0 = sext <48 x i8> %a to <48 x i32>
%1 = sext <48 x i8> %b to <48 x i32>
%2 = mul nuw nsw <48 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
ret i32 %3
}
; Test the case where the multiply feeding the reduction has more than one
; use: the <8 x i32> product is consumed by vector.reduce.add and also by an
; extractelement of lane 0, and the two results are added together.
; Original intent: if G_MUL has more than 1 use, it should not be combined to
; UDOT.
; NOTE(review): the +dotprod expectations below contain both a umull (feeding
; the lane-0 extract) and a udot (feeding the reduction) for SelectionDAG and
; GlobalISel alike, so the multiply is kept but a udot is still formed -
; confirm this matches the intended behaviour and reword the line above if so.
define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_udot_v8i8_multi_use:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: umull v0.8h, v1.8b, v0.8b
; CHECK-SD-BASE-NEXT: uaddlv s1, v0.8h
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: fmov w9, s0
; CHECK-SD-BASE-NEXT: fmov w8, s1
; CHECK-SD-BASE-NEXT: add w0, w8, w9
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: umull v3.8h, v1.8b, v0.8b
; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
; CHECK-SD-DOT-NEXT: ushll v0.4s, v3.4h, #0
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: addp v1.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w8, s1
; CHECK-SD-DOT-NEXT: add w0, w8, w9
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_udot_v8i8_multi_use:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: umull v0.8h, v1.8b, v0.8b
; CHECK-GI-BASE-NEXT: uaddlv s1, v0.8h
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: fmov w8, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v8i8_multi_use:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: umull v3.8h, v1.8b, v0.8b
; CHECK-GI-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
; CHECK-GI-DOT-NEXT: ushll v0.4s, v3.4h, #0
; CHECK-GI-DOT-NEXT: fmov w9, s0
; CHECK-GI-DOT-NEXT: addp v1.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w8, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%0 = zext <8 x i8> %a to <8 x i32>
%1 = zext <8 x i8> %b to <8 x i32>
%2 = mul nuw nsw <8 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
%4 = extractelement <8 x i32> %2, i32 0
%5 = add nuw nsw i32 %3, %4
ret i32 %5
}
; Sum of two independent i16 add-reductions over <8 x i16> inputs, returned as
; a zero-extended i16. SelectionDAG is expected to add the two vectors first
; and reduce once with addv; GlobalISel is expected to reduce each input with
; its own addv, add the scalars and mask the result to 16 bits.
define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v8i16_v8i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v8i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv h0, v0.8h
; CHECK-GI-NEXT: addv h1, v1.8h
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w9, w8, uxth
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two i64 add-reductions, each over an <8 x i16> input zero-extended to
; <8 x i64>. SelectionDAG is expected to widen with ushll/uaddl into .2d lanes
; and finish with addp; GlobalISel is expected to reduce each input with
; uaddlv into a 32-bit value and combine them with a 64-bit add using uxtw.
define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s1, v1.8h
; CHECK-GI-NEXT: uaddlv s0, v0.8h
; CHECK-GI-NEXT: mov w8, v1.s[0]
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add x0, x8, w9, uxtw
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = zext <8 x i16> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two i64 add-reductions, each over an <8 x i16> input sign-extended to
; <8 x i64>. SelectionDAG is expected to widen with sshll/saddl into .2d lanes
; and finish with addp; GlobalISel is expected to reduce each input with
; saddlv into a 32-bit value, sign-extend one with smov and fold the other
; extension into the add via sxtw.
define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s1, v1.8h
; CHECK-GI-NEXT: saddlv s0, v0.8h
; CHECK-GI-NEXT: smov x8, v1.s[0]
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add x0, x8, w9, sxtw
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = sext <8 x i16> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two i64 add-reductions, each over a <4 x i16> input zero-extended to
; <4 x i64>. SelectionDAG is expected to widen to .4s, then use uaddlp for one
; input and the accumulating uadalp for the other before addp; GlobalISel is
; expected to use one uaddlv per input and a 64-bit add with uxtw.
define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv s1, v1.4h
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: mov w8, v1.s[0]
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add x0, x8, w9, uxtw
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = zext <4 x i16> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two i64 add-reductions, each over a <4 x i16> input sign-extended to
; <4 x i64>. SelectionDAG is expected to widen to .4s, then use saddlp for one
; input and the accumulating sadalp for the other before addp; GlobalISel is
; expected to use one saddlv per input, smov for one result and a 64-bit add
; with sxtw for the other.
define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv s1, v1.4h
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: smov x8, v1.s[0]
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add x0, x8, w9, sxtw
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = sext <4 x i16> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two i64 add-reductions, each over a <2 x i16> input zero-extended to
; <2 x i64>. SelectionDAG is expected to pack both inputs into one q register
; (mov v0.d[1], v1.d[0]), mask each 32-bit lane to its low 16 bits and reduce
; once with uaddlv; GlobalISel is expected to widen each input to .2d, mask,
; reduce each with addp and add the two scalars.
define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: movi v2.2d, #0x00ffff0000ffff
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v2.2d, #0x0000000000ffff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = zext <2 x i16> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two i64 add-reductions, each over a <2 x i16> input sign-extended to
; <2 x i64>. Both selectors are expected to widen with ushll and sign-extend
; in-register with a shl/sshr pair by 48. SelectionDAG folds the second
; input's arithmetic shift into an accumulating ssra and reduces once with
; addp; GlobalISel reduces each input with its own addp and adds the scalars.
define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-SD-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #48
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #48
; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-SD-NEXT: ssra v0.2d, v1.2d, #48
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: shl v0.2d, v0.2d, #48
; CHECK-GI-NEXT: shl v1.2d, v1.2d, #48
; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #48
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = sext <2 x i16> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two i32 add-reductions, each over a <16 x i8> input zero-extended to
; <16 x i32>. With +dotprod the reduction is expected to be done as a udot
; against a splat of 1: SelectionDAG chains both udot instructions into one
; accumulator and reduces once, while GlobalISel uses two accumulators and
; reduces each with addv. Without +dotprod, SelectionDAG uses ushll/uaddl
; widening and GlobalISel uses uaddlv per input with a 16-bit masked add.
define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v0.8h, v2.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v1.8h, v3.8h
; CHECK-SD-BASE-NEXT: uaddl v1.4s, v1.4h, v3.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: udot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h1, v1.16b
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s1
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: and w8, w8, #0xffff
; CHECK-GI-BASE-NEXT: add w0, w8, w9, uxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addv s0, v4.4s
; CHECK-GI-DOT-NEXT: addv s1, v3.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%yy = zext <16 x i8> %y to <16 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two i32 add-reductions, each over a <16 x i8> input sign-extended to
; <16 x i32>. With +dotprod the reduction is expected to be done as an sdot
; against a splat of 1: SelectionDAG chains both sdot instructions into one
; accumulator and reduces once, while GlobalISel uses two accumulators and
; reduces each with addv. Without +dotprod, SelectionDAG uses sshll/saddl
; widening and GlobalISel uses saddlv per input with sxth extensions.
define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v0.8h, v2.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v2.4h
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v1.8h, v3.8h
; CHECK-SD-BASE-NEXT: saddl v1.4s, v1.4h, v3.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h1, v1.16b
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s1
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: sxth w8, w8
; CHECK-GI-BASE-NEXT: add w0, w8, w9, sxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addv s0, v4.4s
; CHECK-GI-DOT-NEXT: addv s1, v3.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%yy = sext <16 x i8> %y to <16 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two i32 add-reductions, each over an <8 x i8> input zero-extended to
; <8 x i32>. With +dotprod the reduction is expected to use 64-bit udot
; (.2s/.8b) against a splat of 1 followed by addp: one shared accumulator for
; SelectionDAG, two separate accumulators for GlobalISel. Without +dotprod,
; SelectionDAG uses ushll with uaddlp/uadalp, and GlobalISel uses uaddlv per
; input with a 16-bit masked add.
define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v3.8b, #1
; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v3.8b
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s1
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: and w8, w8, #0xffff
; CHECK-GI-BASE-NEXT: add w0, w8, w9, uxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v4.2s, v0.8b, v2.8b
; CHECK-GI-DOT-NEXT: udot v3.2s, v1.8b, v2.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v3.2s, v3.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = zext <8 x i8> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two i32 add-reductions, each over an <8 x i8> input sign-extended to
; <8 x i32>. With +dotprod the reduction is expected to use 64-bit sdot
; (.2s/.8b) against a splat of 1 followed by addp: one shared accumulator for
; SelectionDAG, two separate accumulators for GlobalISel. Without +dotprod,
; SelectionDAG uses sshll with saddlp/sadalp, and GlobalISel uses saddlv per
; input with sxth extensions.
define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: sadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v3.8b, #1
; CHECK-SD-DOT-NEXT: sdot v2.2s, v1.8b, v3.8b
; CHECK-SD-DOT-NEXT: sdot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: saddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: fmov w8, s1
; CHECK-GI-BASE-NEXT: fmov w9, s0
; CHECK-GI-BASE-NEXT: sxth w8, w8
; CHECK-GI-BASE-NEXT: add w0, w8, w9, sxth
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v4.2s, v0.8b, v2.8b
; CHECK-GI-DOT-NEXT: sdot v3.2s, v1.8b, v2.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v3.2s, v3.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = sext <8 x i8> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two i32 add-reductions, each over a <4 x i8> input zero-extended to
; <4 x i32>. The expectations are shared between the base and +dotprod runs of
; each selector. SelectionDAG is expected to clear the high byte of every
; 16-bit lane with bic, pack both inputs into one q register and reduce once
; with uaddlv; GlobalISel is expected to mask each input, reduce each with
; uaddlv and combine them with a 16-bit masked add.
define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: uaddlv s0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: uaddlv s1, v1.4h
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: and w8, w8, #0xffff
; CHECK-GI-NEXT: add w0, w8, w9, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = zext <4 x i8> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two i32 add-reductions, each over a <4 x i8> input sign-extended to
; <4 x i32>. The expectations are shared between the base and +dotprod runs of
; each selector. SelectionDAG is expected to widen to .4s, sign-extend with
; shl/sshr by 24 (folding the second input's shift into ssra) and reduce once
; with addv; GlobalISel is expected to sign-extend within the 16-bit lanes
; (shl/sshr by 8), reduce each input with saddlv and add with sxth.
define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: shl v0.4s, v0.4s, #24
; CHECK-SD-NEXT: shl v1.4s, v1.4s, #24
; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-SD-NEXT: ssra v0.4s, v1.4s, #24
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s1, v1.4h
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: sxth w8, w8
; CHECK-GI-NEXT: add w0, w8, w9, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = sext <4 x i8> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two i16 add-reductions over <16 x i8> inputs zero-extended to
; <16 x i16>; the result is returned as zeroext i16. SelectionDAG is expected
; to pair-wise widen and accumulate (uaddlp + uadalp) before one addv, while
; GlobalISel is expected to reduce each input with uaddlv, add the scalars and
; mask the result to 16 bits.
define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlp v1.8h, v1.16b
; CHECK-SD-NEXT: uadalp v1.8h, v0.16b
; CHECK-SD-NEXT: addv h0, v1.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: uaddlv h1, v1.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%yy = zext <16 x i8> %y to <16 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Signed counterpart of add_pair_v16i8_v16i16_zext: sum of two i16
; add-reductions over <16 x i8> inputs sign-extended to <16 x i16>, returned as
; signext i16. SelectionDAG is expected to use saddlp + sadalp, one addv and a
; sign-extending lane move (smov); GlobalISel is expected to use one saddlv per
; input, a scalar add and a final sxth.
define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddlp v1.8h, v1.16b
; CHECK-SD-NEXT: sadalp v1.8h, v0.16b
; CHECK-SD-NEXT: addv h0, v1.8h
; CHECK-SD-NEXT: smov w0, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: saddlv h1, v1.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: sxth w0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%yy = sext <16 x i8> %y to <16 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two i16 add-reductions over <8 x i8> inputs zero-extended to
; <8 x i16>, returned as zeroext i16. The SelectionDAG expectations place the
; two 8-byte inputs in the halves of one 16-byte register and use a single
; uaddlv; the GlobalISel expectations reduce each 8-byte input on its own and
; then add and mask in scalar registers.
define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: uaddlv h0, v0.16b
; CHECK-SD-NEXT: umov w0, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.8b
; CHECK-GI-NEXT: uaddlv h1, v1.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: and w0, w8, #0xffff
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%yy = zext <8 x i8> %y to <8 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Signed counterpart of add_pair_v8i8_v8i16_zext: sum of two i16
; add-reductions over <8 x i8> inputs sign-extended to <8 x i16>, returned as
; signext i16. SelectionDAG is expected to add the inputs with a widening saddl
; and reduce once; GlobalISel is expected to reduce each input with saddlv and
; finish with a scalar add plus sxth.
define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: smov w0, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.8b
; CHECK-GI-NEXT: saddlv h1, v1.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: sxth w0, w8
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%yy = sext <8 x i8> %y to <8 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two i8 add-reductions over <16 x i8> inputs with no extension; the
; result is returned as zeroext i8. SelectionDAG is expected to add the two
; vectors first and reduce once with addv; GlobalISel is expected to reduce
; each vector, add the scalars and mask the result to 8 bits.
define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: add v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: addv b0, v0.16b
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addv b0, v0.16b
; CHECK-GI-NEXT: addv b1, v1.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w9, w8, uxtb
; CHECK-GI-NEXT: and w0, w8, #0xff
; CHECK-GI-NEXT: ret
entry:
%z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
%z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
%z = add i8 %z1, %z2
ret i8 %z
}
; Sum of two i64 add-reductions over <16 x i8> inputs zero-extended all the way
; to <16 x i64>. The SelectionDAG expectations widen step by step in vector
; registers (ushll to 8h and 4s, uaddl to 2d) and then add the partial vectors
; before a final addp. The GlobalISel expectations instead reduce each input
; with a byte-to-halfword uaddlv and zero-extend the two 16-bit sums while
; adding them in 64-bit scalar registers.
define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll2 v2.8h, v0.16b, #0
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll2 v3.8h, v1.16b, #0
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-SD-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-SD-NEXT: ushll2 v5.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll2 v6.4s, v3.8h, #0
; CHECK-SD-NEXT: ushll2 v7.4s, v1.8h, #0
; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: uaddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-NEXT: uaddl v2.2d, v5.2s, v2.2s
; CHECK-SD-NEXT: uaddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v4.2s
; CHECK-SD-NEXT: uaddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-NEXT: uaddl v6.2d, v7.2s, v6.2s
; CHECK-SD-NEXT: uaddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h1, v1.16b
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: and x8, x8, #0xffff
; CHECK-GI-NEXT: add x0, x8, w9, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%yy = zext <16 x i8> %y to <16 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Signed counterpart of add_pair_v16i8_v16i64_zext: sum of two i64
; add-reductions over <16 x i8> inputs sign-extended to <16 x i64>. The
; SelectionDAG expectations use the same step-by-step vector widening with the
; signed instructions (sshll, saddl); the GlobalISel expectations use one
; saddlv per input and sign-extend the 16-bit sums during the scalar add.
define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll2 v2.8h, v0.16b, #0
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll2 v3.8h, v1.16b, #0
; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-SD-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-SD-NEXT: sshll2 v5.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: sshll2 v6.4s, v3.8h, #0
; CHECK-SD-NEXT: sshll2 v7.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll v3.4s, v3.4h, #0
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: saddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-NEXT: saddl v2.2d, v5.2s, v2.2s
; CHECK-SD-NEXT: saddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v4.2s
; CHECK-SD-NEXT: saddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-NEXT: saddl v6.2d, v7.2s, v6.2s
; CHECK-SD-NEXT: saddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h1, v1.16b
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: sxth x8, w8
; CHECK-GI-NEXT: add x0, x8, w9, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%yy = sext <16 x i8> %y to <16 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two i64 add-reductions over <8 x i8> inputs zero-extended to
; <8 x i64>. SelectionDAG is expected to widen in vector registers (ushll, then
; uaddl to 2d) and finish with addp; GlobalISel is expected to reduce each
; input with uaddlv from bytes to a halfword and zero-extend the 16-bit sums
; while adding them as 64-bit scalars.
define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h1, v1.8b
; CHECK-GI-NEXT: uaddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: and x8, x8, #0xffff
; CHECK-GI-NEXT: add x0, x8, w9, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = zext <8 x i8> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Signed counterpart of add_pair_v8i8_v8i64_zext: sum of two i64 add-reductions
; over <8 x i8> inputs sign-extended to <8 x i64>. SelectionDAG is expected to
; widen with sshll/saddl in vector registers; GlobalISel is expected to use one
; saddlv per input and sign-extend the 16-bit sums during the scalar add.
define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h1, v1.8b
; CHECK-GI-NEXT: saddlv h0, v0.8b
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: sxth x8, w8
; CHECK-GI-NEXT: add x0, x8, w9, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = sext <8 x i8> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two i64 add-reductions over <4 x i8> inputs zero-extended to
; <4 x i64>. Both sets of expectations begin by masking each 16-bit lane to its
; low byte. SelectionDAG then widens to 4s and accumulates pair-wise into 2d
; (uaddlp + uadalp) before addp; GlobalISel reduces each input with uaddlv and
; zero-extends the halfword sums while adding them as 64-bit scalars.
define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-NEXT: addp d0, v1.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-GI-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: uaddlv s1, v1.4h
; CHECK-GI-NEXT: uaddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: and x8, x8, #0xffff
; CHECK-GI-NEXT: add x0, x8, w9, uxth
; CHECK-GI-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = zext <4 x i8> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Signed counterpart of add_pair_v4i8_v4i64_zext: sum of two i64 add-reductions
; over <4 x i8> inputs sign-extended to <4 x i64>. The SelectionDAG
; expectations widen to 64-bit lanes and sign-extend in-register with shl/sshr
; by 56, using ssra to accumulate. The GlobalISel expectations sign-extend
; within 16-bit lanes (shl/sshr by 8), reduce with saddlv and sign-extend the
; halfword sums during the scalar add.
define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-SD-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-SD-NEXT: shl v3.2d, v3.2d, #56
; CHECK-SD-NEXT: shl v2.2d, v2.2d, #56
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT: sshr v3.2d, v3.2d, #56
; CHECK-SD-NEXT: sshr v2.2d, v2.2d, #56
; CHECK-SD-NEXT: ssra v3.2d, v0.2d, #56
; CHECK-SD-NEXT: ssra v2.2d, v1.2d, #56
; CHECK-SD-NEXT: add v0.2d, v3.2d, v2.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8
; CHECK-GI-NEXT: shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8
; CHECK-GI-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT: saddlv s1, v1.4h
; CHECK-GI-NEXT: saddlv s0, v0.4h
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: sxth x8, w8
; CHECK-GI-NEXT: add x0, x8, w9, sxth
; CHECK-GI-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = sext <4 x i8> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two i64 add-reductions over <2 x i8> inputs zero-extended to
; <2 x i64>. The SelectionDAG expectations merge both inputs into one 16-byte
; register, mask each 32-bit lane to its low byte and reduce once with a
; widening uaddlv. The GlobalISel expectations widen each input to 2d, mask to
; the low byte, reduce each with addp and add the two scalars.
define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-SD-NEXT: movi v2.2d, #0x0000ff000000ff
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: uaddlv d0, v0.4s
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = zext <2 x i8> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Signed counterpart of add_pair_v2i8_v2i64_zext: sum of two i64 add-reductions
; over <2 x i8> inputs sign-extended to <2 x i64>. Both sets of expectations
; widen to 2d and sign-extend in-register by shifting left and arithmetic-right
; by 56. SelectionDAG accumulates the second input with ssra and reduces once;
; GlobalISel reduces each input with addp and adds the two scalars.
define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-SD-NEXT: ssra v0.2d, v1.2d, #56
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-GI-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = sext <2 x i8> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of four i32 add-reductions over <8 x i8> inputs: %ax and %ay are
; zero-extended to <8 x i32>, %bx and %by are sign-extended, each pair is added
; and the two pair sums are added. This function has separate expectations for
; all four run configurations because the +dotprod runs differ from the base
; runs: with dotprod the reductions are expected to become udot/sdot against a
; vector of ones. The SelectionDAG dotprod output chains each pair into one
; accumulator (two udot into v6, two sdot into v4), whereas the GlobalISel
; dotprod output uses four independent accumulators and adds the results as
; scalars.
define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v3.8h, v3.8b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: saddlp v3.4s, v3.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: sadalp v3.4s, v2.8h
; CHECK-SD-BASE-NEXT: add v0.4s, v3.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v5.8b, #1
; CHECK-SD-DOT-NEXT: movi v6.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v6.2s, v1.8b, v5.8b
; CHECK-SD-DOT-NEXT: sdot v4.2s, v3.8b, v5.8b
; CHECK-SD-DOT-NEXT: udot v6.2s, v0.8b, v5.8b
; CHECK-SD-DOT-NEXT: sdot v4.2s, v2.8b, v5.8b
; CHECK-SD-DOT-NEXT: add v0.2s, v6.2s, v4.2s
; CHECK-SD-DOT-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h3, v3.8b
; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT: saddlv h2, v2.8b
; CHECK-GI-BASE-NEXT: fmov w8, s3
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: fmov w10, s0
; CHECK-GI-BASE-NEXT: fmov w11, s2
; CHECK-GI-BASE-NEXT: sxth w8, w8
; CHECK-GI-BASE-NEXT: and w9, w9, #0xffff
; CHECK-GI-BASE-NEXT: add w9, w9, w10, uxth
; CHECK-GI-BASE-NEXT: add w8, w8, w11, sxth
; CHECK-GI-BASE-NEXT: add w0, w9, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v4.8b, #1
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v6.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v7.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v16.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v5.2s, v0.8b, v4.8b
; CHECK-GI-DOT-NEXT: sdot v6.2s, v3.8b, v4.8b
; CHECK-GI-DOT-NEXT: udot v7.2s, v1.8b, v4.8b
; CHECK-GI-DOT-NEXT: sdot v16.2s, v2.8b, v4.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v5.2s, v5.2s
; CHECK-GI-DOT-NEXT: addp v3.2s, v6.2s, v6.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v7.2s, v7.2s
; CHECK-GI-DOT-NEXT: addp v2.2s, v16.2s, v16.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w11, s3
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: fmov w10, s2
; CHECK-GI-DOT-NEXT: add w8, w8, w9
; CHECK-GI-DOT-NEXT: add w9, w10, w11
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%axx = zext <8 x i8> %ax to <8 x i32>
%az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
%ayy = zext <8 x i8> %ay to <8 x i32>
%az2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ayy)
%az = add i32 %az1, %az2
%bxx = sext <8 x i8> %bx to <8 x i32>
%bz1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bxx)
%byy = sext <8 x i8> %by to <8 x i32>
%bz2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %byy)
%bz = add i32 %bz1, %bz2
%z = add i32 %az, %bz
ret i32 %z
}
; Single i32 add-reduction over a <4 x i32> value built from four <8 x i16>
; inputs. Each input is zero-extended to <8 x i32>, split into its lanes 0-3
; and lanes 4-7 with shufflevector, and the halves are added; the four partial
; vectors are then summed before one reduce.add call. SelectionDAG is expected
; to recognise the pattern as uaddlp/uadalp, while GlobalISel is expected to
; widen with ushll/ushll2 and use plain vector adds.
; Review note (to confirm with the test owner): despite "sext_zext" in the
; name, all four inputs here use zext, and the expected code contains only
; unsigned widening instructions. The %sNh values select lanes 0-3 and the %sNl
; values select lanes 4-7.
define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
; CHECK-SD-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-NEXT: uaddlp v3.4s, v3.8h
; CHECK-SD-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-NEXT: uadalp v3.4s, v2.8h
; CHECK-SD-NEXT: add v0.4s, v3.4s, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0
; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT: ushll v5.4s, v1.4h, #0
; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT: ushll v6.4s, v2.4h, #0
; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT: ushll v7.4s, v3.4h, #0
; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-GI-NEXT: add v0.4s, v4.4s, v0.4s
; CHECK-GI-NEXT: add v1.4s, v5.4s, v1.4s
; CHECK-GI-NEXT: add v2.4s, v6.4s, v2.4s
; CHECK-GI-NEXT: add v3.4s, v7.4s, v3.4s
; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
%axx = zext <8 x i16> %ax to <8 x i32>
%s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s1l = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%axs = add <4 x i32> %s1h, %s1l
%ayy = zext <8 x i16> %ay to <8 x i32>
%s2h = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s2l = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%ays = add <4 x i32> %s2h, %s2l
%az = add <4 x i32> %axs, %ays
%bxx = zext <8 x i16> %bx to <8 x i32>
%s3h = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s3l = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%bxs = add <4 x i32> %s3h, %s3l
%byy = zext <8 x i16> %by to <8 x i32>
%s4h = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s4l = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%bys = add <4 x i32> %s4h, %s4l
%bz = add <4 x i32> %bxs, %bys
%z = add <4 x i32> %az, %bz
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z)
ret i32 %z2
}
; Sum of two i64 add-reductions over <2 x i64> inputs with no extension.
; SelectionDAG is expected to add the two vectors first and reduce once with
; addp; GlobalISel is expected to reduce each vector with addp and add the two
; scalar results.
define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-SD-LABEL: add_pair_v2i64_v2i64:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_pair_v2i64_v2i64:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: addp d1, v1.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: fmov x9, d1
; CHECK-GI-NEXT: add x0, x8, x9
; CHECK-GI-NEXT: ret
entry:
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
%z = add i64 %z1, %z2
ret i64 %z
}
; Irregularly sized vectors
; i16 add-reduction over a <24 x i8> argument zero-extended to <24 x i16>.
; 24 elements do not fill a whole number of 16-byte registers; in the expected
; code the first eight elements arrive in w0-w7 and the remaining sixteen are
; read from the stack at 8-byte intervals ([sp] through [sp, #120]).
; SelectionDAG is expected to assemble three 8-byte vectors and combine them
; with uaddl + uaddw before addv; GlobalISel is expected to assemble one
; 16-byte and one 8-byte vector, reduce each with uaddlv and add the scalars.
define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) {
; CHECK-SD-LABEL: add_v24i8_v24i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: ldr b1, [sp, #64]
; CHECK-SD-NEXT: add x8, sp, #72
; CHECK-SD-NEXT: ldr b2, [sp]
; CHECK-SD-NEXT: add x9, sp, #80
; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8]
; CHECK-SD-NEXT: add x8, sp, #8
; CHECK-SD-NEXT: mov v0.b[1], w1
; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8]
; CHECK-SD-NEXT: add x8, sp, #16
; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-NEXT: add x9, sp, #88
; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8]
; CHECK-SD-NEXT: add x8, sp, #24
; CHECK-SD-NEXT: mov v0.b[2], w2
; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-NEXT: add x9, sp, #96
; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8]
; CHECK-SD-NEXT: add x8, sp, #32
; CHECK-SD-NEXT: mov v0.b[3], w3
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: add x9, sp, #104
; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8]
; CHECK-SD-NEXT: add x8, sp, #40
; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-NEXT: add x9, sp, #112
; CHECK-SD-NEXT: mov v0.b[4], w4
; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8]
; CHECK-SD-NEXT: add x8, sp, #48
; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-NEXT: add x9, sp, #120
; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8]
; CHECK-SD-NEXT: add x8, sp, #56
; CHECK-SD-NEXT: mov v0.b[5], w5
; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8]
; CHECK-SD-NEXT: mov v0.b[6], w6
; CHECK-SD-NEXT: mov v0.b[7], w7
; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v2.8b
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v24i8_v24i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: ldr w8, [sp]
; CHECK-GI-NEXT: ldr w9, [sp, #8]
; CHECK-GI-NEXT: ldr w10, [sp, #72]
; CHECK-GI-NEXT: mov v0.b[1], w1
; CHECK-GI-NEXT: mov v0.b[2], w2
; CHECK-GI-NEXT: mov v0.b[3], w3
; CHECK-GI-NEXT: mov v0.b[4], w4
; CHECK-GI-NEXT: mov v0.b[5], w5
; CHECK-GI-NEXT: mov v0.b[6], w6
; CHECK-GI-NEXT: mov v0.b[7], w7
; CHECK-GI-NEXT: mov v0.b[8], w8
; CHECK-GI-NEXT: ldr w8, [sp, #64]
; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: ldr w8, [sp, #16]
; CHECK-GI-NEXT: mov v0.b[9], w9
; CHECK-GI-NEXT: ldr w9, [sp, #80]
; CHECK-GI-NEXT: mov v1.b[1], w10
; CHECK-GI-NEXT: mov v0.b[10], w8
; CHECK-GI-NEXT: ldr w8, [sp, #24]
; CHECK-GI-NEXT: mov v1.b[2], w9
; CHECK-GI-NEXT: ldr w9, [sp, #88]
; CHECK-GI-NEXT: mov v0.b[11], w8
; CHECK-GI-NEXT: ldr w8, [sp, #32]
; CHECK-GI-NEXT: mov v1.b[3], w9
; CHECK-GI-NEXT: ldr w9, [sp, #96]
; CHECK-GI-NEXT: mov v0.b[12], w8
; CHECK-GI-NEXT: ldr w8, [sp, #40]
; CHECK-GI-NEXT: mov v1.b[4], w9
; CHECK-GI-NEXT: ldr w9, [sp, #104]
; CHECK-GI-NEXT: mov v0.b[13], w8
; CHECK-GI-NEXT: ldr w8, [sp, #48]
; CHECK-GI-NEXT: mov v1.b[5], w9
; CHECK-GI-NEXT: ldr w9, [sp, #112]
; CHECK-GI-NEXT: mov v0.b[14], w8
; CHECK-GI-NEXT: ldr w8, [sp, #56]
; CHECK-GI-NEXT: mov v1.b[6], w9
; CHECK-GI-NEXT: ldr w9, [sp, #120]
; CHECK-GI-NEXT: mov v0.b[15], w8
; CHECK-GI-NEXT: mov v1.b[7], w9
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: uaddlv h1, v1.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <24 x i8> %x to <24 x i16>
%z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx)
ret i16 %z
}
; i16 add-reduction over a <32 x i8> argument zero-extended to <32 x i16>. In
; the expected code the input occupies two 16-byte registers (v0 and v1).
; SelectionDAG is expected to combine them with widening adds (uaddl2/uaddl)
; and reduce once; GlobalISel is expected to reduce each register with uaddlv
; and add the two scalars.
define i16 @add_v32i8_v32i16_zext(<32 x i8> %x) {
; CHECK-SD-LABEL: add_v32i8_v32i16_zext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddl2 v2.8h, v0.16b, v1.16b
; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v32i8_v32i16_zext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: uaddlv h0, v0.16b
; CHECK-GI-NEXT: uaddlv h1, v1.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = zext <32 x i8> %x to <32 x i16>
%z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx)
ret i16 %z
}
; Signed counterpart of add_v24i8_v24i16_zext: i16 add-reduction over a
; <24 x i8> argument sign-extended to <24 x i16>. The expected argument
; assembly (w0-w7 plus sixteen stack slots at 8-byte intervals) is the same as
; in the unsigned test; only the arithmetic changes to the signed forms
; (saddl + saddw for SelectionDAG, saddlv for GlobalISel).
define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) {
; CHECK-SD-LABEL: add_v24i8_v24i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: ldr b1, [sp, #64]
; CHECK-SD-NEXT: add x8, sp, #72
; CHECK-SD-NEXT: ldr b2, [sp]
; CHECK-SD-NEXT: add x9, sp, #80
; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8]
; CHECK-SD-NEXT: add x8, sp, #8
; CHECK-SD-NEXT: mov v0.b[1], w1
; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8]
; CHECK-SD-NEXT: add x8, sp, #16
; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-NEXT: add x9, sp, #88
; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8]
; CHECK-SD-NEXT: add x8, sp, #24
; CHECK-SD-NEXT: mov v0.b[2], w2
; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-NEXT: add x9, sp, #96
; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8]
; CHECK-SD-NEXT: add x8, sp, #32
; CHECK-SD-NEXT: mov v0.b[3], w3
; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-NEXT: add x9, sp, #104
; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8]
; CHECK-SD-NEXT: add x8, sp, #40
; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-NEXT: add x9, sp, #112
; CHECK-SD-NEXT: mov v0.b[4], w4
; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8]
; CHECK-SD-NEXT: add x8, sp, #48
; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-NEXT: add x9, sp, #120
; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8]
; CHECK-SD-NEXT: add x8, sp, #56
; CHECK-SD-NEXT: mov v0.b[5], w5
; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8]
; CHECK-SD-NEXT: mov v0.b[6], w6
; CHECK-SD-NEXT: mov v0.b[7], w7
; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: saddw v0.8h, v0.8h, v2.8b
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v24i8_v24i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: ldr w8, [sp]
; CHECK-GI-NEXT: ldr w9, [sp, #8]
; CHECK-GI-NEXT: ldr w10, [sp, #72]
; CHECK-GI-NEXT: mov v0.b[1], w1
; CHECK-GI-NEXT: mov v0.b[2], w2
; CHECK-GI-NEXT: mov v0.b[3], w3
; CHECK-GI-NEXT: mov v0.b[4], w4
; CHECK-GI-NEXT: mov v0.b[5], w5
; CHECK-GI-NEXT: mov v0.b[6], w6
; CHECK-GI-NEXT: mov v0.b[7], w7
; CHECK-GI-NEXT: mov v0.b[8], w8
; CHECK-GI-NEXT: ldr w8, [sp, #64]
; CHECK-GI-NEXT: fmov s1, w8
; CHECK-GI-NEXT: ldr w8, [sp, #16]
; CHECK-GI-NEXT: mov v0.b[9], w9
; CHECK-GI-NEXT: ldr w9, [sp, #80]
; CHECK-GI-NEXT: mov v1.b[1], w10
; CHECK-GI-NEXT: mov v0.b[10], w8
; CHECK-GI-NEXT: ldr w8, [sp, #24]
; CHECK-GI-NEXT: mov v1.b[2], w9
; CHECK-GI-NEXT: ldr w9, [sp, #88]
; CHECK-GI-NEXT: mov v0.b[11], w8
; CHECK-GI-NEXT: ldr w8, [sp, #32]
; CHECK-GI-NEXT: mov v1.b[3], w9
; CHECK-GI-NEXT: ldr w9, [sp, #96]
; CHECK-GI-NEXT: mov v0.b[12], w8
; CHECK-GI-NEXT: ldr w8, [sp, #40]
; CHECK-GI-NEXT: mov v1.b[4], w9
; CHECK-GI-NEXT: ldr w9, [sp, #104]
; CHECK-GI-NEXT: mov v0.b[13], w8
; CHECK-GI-NEXT: ldr w8, [sp, #48]
; CHECK-GI-NEXT: mov v1.b[5], w9
; CHECK-GI-NEXT: ldr w9, [sp, #112]
; CHECK-GI-NEXT: mov v0.b[14], w8
; CHECK-GI-NEXT: ldr w8, [sp, #56]
; CHECK-GI-NEXT: mov v1.b[6], w9
; CHECK-GI-NEXT: ldr w9, [sp, #120]
; CHECK-GI-NEXT: mov v0.b[15], w8
; CHECK-GI-NEXT: mov v1.b[7], w9
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: saddlv h1, v1.8b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <24 x i8> %x to <24 x i16>
%z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx)
ret i16 %z
}
; Signed counterpart of add_v32i8_v32i16_zext: i16 add-reduction over a
; <32 x i8> argument sign-extended to <32 x i16>. SelectionDAG is expected to
; combine the two input registers with saddl2/saddl and reduce once;
; GlobalISel is expected to reduce each register with saddlv and add the two
; scalars.
define i16 @add_v32i8_v32i16_sext(<32 x i8> %x) {
; CHECK-SD-LABEL: add_v32i8_v32i16_sext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: saddl2 v2.8h, v0.16b, v1.16b
; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-SD-NEXT: add v0.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: addv h0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: add_v32i8_v32i16_sext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: saddlv h0, v0.16b
; CHECK-GI-NEXT: saddlv h1, v1.16b
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
entry:
%xx = sext <32 x i8> %x to <32 x i16>
%z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx)
ret i16 %z
}
; Irregularly sized vectors and larger extends
; Zero-extends <24 x i8> to <24 x i32> and add-reduces the result to one i32.
; In the expected output the 24 bytes arrive as scalars: lanes 0-7 in w0-w7,
; lanes 8-15 in the stack slots [sp] .. [sp, #56] and lanes 16-23 in
; [sp, #64] .. [sp, #120]; every configuration rebuilds vectors lane by lane.
; Expected reduction per configuration:
;  - SelectionDAG, base: ushll to 16-bit lanes, uaddl/uaddw widening adds, addv.
;  - SelectionDAG, +dotprod: udot against vectors of ones, once on 16 bytes
;    (4s accumulator, addv) and once on 8 bytes (2s accumulator, addp), then a
;    scalar add.
;  - GlobalISel, base: uaddlv on the 16-byte and 8-byte pieces, a scalar add,
;    and an `and #0xffff` that zero-extends the 16-bit sum.
;  - GlobalISel, +dotprod: two 16-byte udot operations; for the 8-byte piece the
;    upper halves of both the data (fmov d1, d1) and the ones vector
;    (movi v2.8b) are zero, so the upper lanes contribute nothing.
define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: fmov s0, w0
; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64]
; CHECK-SD-BASE-NEXT: add x8, sp, #72
; CHECK-SD-BASE-NEXT: ldr b2, [sp]
; CHECK-SD-BASE-NEXT: add x9, sp, #80
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #8
; CHECK-SD-BASE-NEXT: mov v0.b[1], w1
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #16
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #88
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #24
; CHECK-SD-BASE-NEXT: mov v0.b[2], w2
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #96
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #32
; CHECK-SD-BASE-NEXT: mov v0.b[3], w3
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #104
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #40
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #112
; CHECK-SD-BASE-NEXT: mov v0.b[4], w4
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #48
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #120
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #56
; CHECK-SD-BASE-NEXT: mov v0.b[5], w5
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8]
; CHECK-SD-BASE-NEXT: mov v0.b[6], w6
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-SD-BASE-NEXT: mov v0.b[7], w7
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v3.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v3.4s, v2.8h
; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v2.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: fmov s0, w0
; CHECK-SD-DOT-NEXT: mov x8, sp
; CHECK-SD-DOT-NEXT: ldr b1, [sp, #64]
; CHECK-SD-DOT-NEXT: add x9, sp, #72
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[1], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #80
; CHECK-SD-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: mov v0.b[1], w1
; CHECK-SD-DOT-NEXT: movi v5.8b, #1
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #88
; CHECK-SD-DOT-NEXT: mov v0.b[2], w2
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #96
; CHECK-SD-DOT-NEXT: mov v0.b[3], w3
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #104
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #112
; CHECK-SD-DOT-NEXT: mov v0.b[4], w4
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #120
; CHECK-SD-DOT-NEXT: mov v0.b[5], w5
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-DOT-NEXT: mov v0.b[6], w6
; CHECK-SD-DOT-NEXT: udot v4.2s, v1.8b, v5.8b
; CHECK-SD-DOT-NEXT: mov v0.b[7], w7
; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s
; CHECK-SD-DOT-NEXT: fmov w9, s1
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #8
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #16
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #24
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[11], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #32
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[12], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #40
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[13], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #48
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[14], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #56
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[15], [x8]
; CHECK-SD-DOT-NEXT: udot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w9
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: fmov s0, w0
; CHECK-GI-BASE-NEXT: ldr w8, [sp]
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #8]
; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72]
; CHECK-GI-BASE-NEXT: mov v0.b[1], w1
; CHECK-GI-BASE-NEXT: mov v0.b[2], w2
; CHECK-GI-BASE-NEXT: mov v0.b[3], w3
; CHECK-GI-BASE-NEXT: mov v0.b[4], w4
; CHECK-GI-BASE-NEXT: mov v0.b[5], w5
; CHECK-GI-BASE-NEXT: mov v0.b[6], w6
; CHECK-GI-BASE-NEXT: mov v0.b[7], w7
; CHECK-GI-BASE-NEXT: mov v0.b[8], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #64]
; CHECK-GI-BASE-NEXT: fmov s1, w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16]
; CHECK-GI-BASE-NEXT: mov v0.b[9], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80]
; CHECK-GI-BASE-NEXT: mov v1.b[1], w10
; CHECK-GI-BASE-NEXT: mov v0.b[10], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24]
; CHECK-GI-BASE-NEXT: mov v1.b[2], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88]
; CHECK-GI-BASE-NEXT: mov v0.b[11], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32]
; CHECK-GI-BASE-NEXT: mov v1.b[3], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96]
; CHECK-GI-BASE-NEXT: mov v0.b[12], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40]
; CHECK-GI-BASE-NEXT: mov v1.b[4], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104]
; CHECK-GI-BASE-NEXT: mov v0.b[13], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48]
; CHECK-GI-BASE-NEXT: mov v1.b[5], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112]
; CHECK-GI-BASE-NEXT: mov v0.b[14], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56]
; CHECK-GI-BASE-NEXT: mov v1.b[6], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120]
; CHECK-GI-BASE-NEXT: mov v0.b[15], w8
; CHECK-GI-BASE-NEXT: mov v1.b[7], w9
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: uaddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: fmov s0, w0
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64]
; CHECK-GI-DOT-NEXT: ldr w8, [sp]
; CHECK-GI-DOT-NEXT: ldr w10, [sp, #72]
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.8b, #1
; CHECK-GI-DOT-NEXT: fmov s1, w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80]
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v0.b[1], w1
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v1.b[1], w10
; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0]
; CHECK-GI-DOT-NEXT: mov v0.b[2], w2
; CHECK-GI-DOT-NEXT: mov v1.b[2], w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88]
; CHECK-GI-DOT-NEXT: mov v0.b[3], w3
; CHECK-GI-DOT-NEXT: mov v1.b[3], w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96]
; CHECK-GI-DOT-NEXT: mov v0.b[4], w4
; CHECK-GI-DOT-NEXT: mov v1.b[4], w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104]
; CHECK-GI-DOT-NEXT: mov v0.b[5], w5
; CHECK-GI-DOT-NEXT: mov v1.b[5], w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112]
; CHECK-GI-DOT-NEXT: mov v0.b[6], w6
; CHECK-GI-DOT-NEXT: mov v1.b[6], w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120]
; CHECK-GI-DOT-NEXT: mov v0.b[7], w7
; CHECK-GI-DOT-NEXT: mov v1.b[7], w9
; CHECK-GI-DOT-NEXT: mov v0.b[8], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8]
; CHECK-GI-DOT-NEXT: fmov d1, d1
; CHECK-GI-DOT-NEXT: mov v0.b[9], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16]
; CHECK-GI-DOT-NEXT: udot v4.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: mov v0.b[10], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24]
; CHECK-GI-DOT-NEXT: mov v0.b[11], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32]
; CHECK-GI-DOT-NEXT: mov v0.b[12], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40]
; CHECK-GI-DOT-NEXT: mov v0.b[13], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48]
; CHECK-GI-DOT-NEXT: mov v0.b[14], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56]
; CHECK-GI-DOT-NEXT: mov v0.b[15], w8
; CHECK-GI-DOT-NEXT: udot v5.4s, v0.16b, v3.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <24 x i8> %x to <24 x i32>
%z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx)
ret i32 %z
}
; Zero-extends <32 x i8> to <32 x i32> and add-reduces the result to one i32.
; The input occupies two full 16-byte registers (v0, v1).
; Expected reduction per configuration:
;  - SelectionDAG, base: ushll/ushll2 to 16-bit lanes, uaddl/uaddl2 widening
;    adds, vector adds, addv.
;  - SelectionDAG, +dotprod: two udot operations against a vector of ones that
;    accumulate into the same register, then addv.
;  - GlobalISel, base: one uaddlv per input register, a scalar add, and an
;    `and #0xffff` that zero-extends the 16-bit sum.
;  - GlobalISel, +dotprod: two udot operations into separate zeroed
;    accumulators, a vector add, then addv.
define i32 @add_v32i8_v32i32_zext(<32 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: uaddl v2.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: uaddl2 v5.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v1.4s, v5.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: udot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: uaddlv h1, v1.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v4.4s, v3.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <32 x i8> %x to <32 x i32>
%z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx)
ret i32 %z
}
; Sign-extends <24 x i8> to <24 x i32> and add-reduces the result to one i32.
; In the expected output the 24 bytes arrive as scalars: lanes 0-7 in w0-w7,
; lanes 8-15 in the stack slots [sp] .. [sp, #56] and lanes 16-23 in
; [sp, #64] .. [sp, #120]; every configuration rebuilds vectors lane by lane.
; Expected reduction per configuration:
;  - SelectionDAG, base: sshll to 16-bit lanes, saddl/saddw widening adds, addv.
;  - SelectionDAG, +dotprod: sdot against vectors of ones, once on 16 bytes
;    (4s accumulator, addv) and once on 8 bytes (2s accumulator, addp), then a
;    scalar add.
;  - GlobalISel, base: saddlv on the 16-byte and 8-byte pieces, a scalar add,
;    and an sxth that sign-extends the 16-bit sum.
;  - GlobalISel, +dotprod: two 16-byte sdot operations; for the 8-byte piece the
;    upper halves of both the data (fmov d1, d1) and the ones vector
;    (movi v2.8b) are zero, so the upper lanes contribute nothing.
define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: fmov s0, w0
; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64]
; CHECK-SD-BASE-NEXT: add x8, sp, #72
; CHECK-SD-BASE-NEXT: ldr b2, [sp]
; CHECK-SD-BASE-NEXT: add x9, sp, #80
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #8
; CHECK-SD-BASE-NEXT: mov v0.b[1], w1
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #16
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #88
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #24
; CHECK-SD-BASE-NEXT: mov v0.b[2], w2
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #96
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #32
; CHECK-SD-BASE-NEXT: mov v0.b[3], w3
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #104
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #40
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #112
; CHECK-SD-BASE-NEXT: mov v0.b[4], w4
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #48
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-BASE-NEXT: add x9, sp, #120
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8]
; CHECK-SD-BASE-NEXT: add x8, sp, #56
; CHECK-SD-BASE-NEXT: mov v0.b[5], w5
; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8]
; CHECK-SD-BASE-NEXT: mov v0.b[6], w6
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-SD-BASE-NEXT: mov v0.b[7], w7
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v3.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v3.4s, v2.8h
; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v2.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: fmov s0, w0
; CHECK-SD-DOT-NEXT: mov x8, sp
; CHECK-SD-DOT-NEXT: ldr b1, [sp, #64]
; CHECK-SD-DOT-NEXT: add x9, sp, #72
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[1], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #80
; CHECK-SD-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: mov v0.b[1], w1
; CHECK-SD-DOT-NEXT: movi v5.8b, #1
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #88
; CHECK-SD-DOT-NEXT: mov v0.b[2], w2
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #96
; CHECK-SD-DOT-NEXT: mov v0.b[3], w3
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #104
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #112
; CHECK-SD-DOT-NEXT: mov v0.b[4], w4
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-SD-DOT-NEXT: add x9, sp, #120
; CHECK-SD-DOT-NEXT: mov v0.b[5], w5
; CHECK-SD-DOT-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-SD-DOT-NEXT: mov v0.b[6], w6
; CHECK-SD-DOT-NEXT: sdot v4.2s, v1.8b, v5.8b
; CHECK-SD-DOT-NEXT: mov v0.b[7], w7
; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s
; CHECK-SD-DOT-NEXT: fmov w9, s1
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #8
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #16
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #24
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[11], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #32
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[12], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #40
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[13], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #48
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[14], [x8]
; CHECK-SD-DOT-NEXT: add x8, sp, #56
; CHECK-SD-DOT-NEXT: ld1 { v0.b }[15], [x8]
; CHECK-SD-DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w9
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: fmov s0, w0
; CHECK-GI-BASE-NEXT: ldr w8, [sp]
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #8]
; CHECK-GI-BASE-NEXT: ldr w10, [sp, #72]
; CHECK-GI-BASE-NEXT: mov v0.b[1], w1
; CHECK-GI-BASE-NEXT: mov v0.b[2], w2
; CHECK-GI-BASE-NEXT: mov v0.b[3], w3
; CHECK-GI-BASE-NEXT: mov v0.b[4], w4
; CHECK-GI-BASE-NEXT: mov v0.b[5], w5
; CHECK-GI-BASE-NEXT: mov v0.b[6], w6
; CHECK-GI-BASE-NEXT: mov v0.b[7], w7
; CHECK-GI-BASE-NEXT: mov v0.b[8], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #64]
; CHECK-GI-BASE-NEXT: fmov s1, w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #16]
; CHECK-GI-BASE-NEXT: mov v0.b[9], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #80]
; CHECK-GI-BASE-NEXT: mov v1.b[1], w10
; CHECK-GI-BASE-NEXT: mov v0.b[10], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #24]
; CHECK-GI-BASE-NEXT: mov v1.b[2], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #88]
; CHECK-GI-BASE-NEXT: mov v0.b[11], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #32]
; CHECK-GI-BASE-NEXT: mov v1.b[3], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #96]
; CHECK-GI-BASE-NEXT: mov v0.b[12], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #40]
; CHECK-GI-BASE-NEXT: mov v1.b[4], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #104]
; CHECK-GI-BASE-NEXT: mov v0.b[13], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #48]
; CHECK-GI-BASE-NEXT: mov v1.b[5], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #112]
; CHECK-GI-BASE-NEXT: mov v0.b[14], w8
; CHECK-GI-BASE-NEXT: ldr w8, [sp, #56]
; CHECK-GI-BASE-NEXT: mov v1.b[6], w9
; CHECK-GI-BASE-NEXT: ldr w9, [sp, #120]
; CHECK-GI-BASE-NEXT: mov v0.b[15], w8
; CHECK-GI-BASE-NEXT: mov v1.b[7], w9
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: saddlv h1, v1.8b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: fmov s0, w0
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #64]
; CHECK-GI-DOT-NEXT: ldr w8, [sp]
; CHECK-GI-DOT-NEXT: ldr w10, [sp, #72]
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.8b, #1
; CHECK-GI-DOT-NEXT: fmov s1, w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #80]
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v0.b[1], w1
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v1.b[1], w10
; CHECK-GI-DOT-NEXT: mov v3.d[1], v2.d[0]
; CHECK-GI-DOT-NEXT: mov v0.b[2], w2
; CHECK-GI-DOT-NEXT: mov v1.b[2], w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #88]
; CHECK-GI-DOT-NEXT: mov v0.b[3], w3
; CHECK-GI-DOT-NEXT: mov v1.b[3], w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #96]
; CHECK-GI-DOT-NEXT: mov v0.b[4], w4
; CHECK-GI-DOT-NEXT: mov v1.b[4], w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #104]
; CHECK-GI-DOT-NEXT: mov v0.b[5], w5
; CHECK-GI-DOT-NEXT: mov v1.b[5], w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #112]
; CHECK-GI-DOT-NEXT: mov v0.b[6], w6
; CHECK-GI-DOT-NEXT: mov v1.b[6], w9
; CHECK-GI-DOT-NEXT: ldr w9, [sp, #120]
; CHECK-GI-DOT-NEXT: mov v0.b[7], w7
; CHECK-GI-DOT-NEXT: mov v1.b[7], w9
; CHECK-GI-DOT-NEXT: mov v0.b[8], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #8]
; CHECK-GI-DOT-NEXT: fmov d1, d1
; CHECK-GI-DOT-NEXT: mov v0.b[9], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #16]
; CHECK-GI-DOT-NEXT: sdot v4.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: mov v0.b[10], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #24]
; CHECK-GI-DOT-NEXT: mov v0.b[11], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #32]
; CHECK-GI-DOT-NEXT: mov v0.b[12], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #40]
; CHECK-GI-DOT-NEXT: mov v0.b[13], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #48]
; CHECK-GI-DOT-NEXT: mov v0.b[14], w8
; CHECK-GI-DOT-NEXT: ldr w8, [sp, #56]
; CHECK-GI-DOT-NEXT: mov v0.b[15], w8
; CHECK-GI-DOT-NEXT: sdot v5.4s, v0.16b, v3.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v5.4s, v4.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <24 x i8> %x to <24 x i32>
%z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx)
ret i32 %z
}
; Sign-extends <32 x i8> to <32 x i32> and add-reduces the result to one i32.
; The input occupies two full 16-byte registers (v0, v1).
; Expected reduction per configuration:
;  - SelectionDAG, base: sshll/sshll2 to 16-bit lanes, saddl/saddl2 widening
;    adds, vector adds, addv.
;  - SelectionDAG, +dotprod: two sdot operations against a vector of ones that
;    accumulate into the same register, then addv.
;  - GlobalISel, base: one saddlv per input register, a scalar add, and an sxth
;    that sign-extends the 16-bit sum.
;  - GlobalISel, +dotprod: two sdot operations into separate zeroed
;    accumulators, a vector add, then addv.
define i32 @add_v32i8_v32i32_sext(<32 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v2.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: saddl v2.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: saddl2 v5.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v1.4s, v5.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT: saddlv h1, v1.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v4.4s, v3.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <32 x i8> %x to <32 x i32>
%z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx)
ret i32 %z
}
; Fully unrolled sum of absolute differences over eight rows of eight bytes.
;   %p1, %p2 - base pointers of the two byte blocks
;   %s1, %s2 - row strides in bytes for %p1 and %p2 (sign-extended to i64)
; Each row loads <8 x i8> from both blocks, zero-extends to i32, subtracts,
; takes llvm.abs, and add-reduces to a scalar; the eight row sums are chained
; with scalar adds.
; Expected output per configuration:
;  - SelectionDAG, base: uabdl per row, accumulated with uaddlp/uadalp into one
;    4s register, then a single addv.
;  - SelectionDAG, +dotprod: uabd per row, accumulated with udot against a
;    vector of ones into one 2s register, then a single addp.
;  - GlobalISel (both runs): usubl, sshll/sshll2, abs and addv per row, with
;    the row results summed by scalar adds.
define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
; CHECK-SD-BASE-LABEL: full:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-SD-BASE-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-BASE-NEXT: sxtw x8, w3
; CHECK-SD-BASE-NEXT: sxtw x9, w1
; CHECK-SD-BASE-NEXT: ldr d0, [x0]
; CHECK-SD-BASE-NEXT: ldr d1, [x2]
; CHECK-SD-BASE-NEXT: add x10, x0, x9
; CHECK-SD-BASE-NEXT: add x11, x2, x8
; CHECK-SD-BASE-NEXT: uabdl v0.8h, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: add x10, x10, x9
; CHECK-SD-BASE-NEXT: add x11, x11, x8
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: add x11, x11, x8
; CHECK-SD-BASE-NEXT: uaddlp v0.4s, v0.8h
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: add x10, x10, x9
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: add x11, x11, x8
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: add x10, x10, x9
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: add x11, x11, x8
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: add x10, x10, x9
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: add x11, x11, x8
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: add x10, x10, x9
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11]
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10]
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: ldr d2, [x11, x8]
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: ldr d1, [x10, x9]
; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: full:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ldr d0, [x0]
; CHECK-SD-DOT-NEXT: ldr d1, [x2]
; CHECK-SD-DOT-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-SD-DOT-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-DOT-NEXT: sxtw x8, w3
; CHECK-SD-DOT-NEXT: sxtw x9, w1
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v3.8b, #1
; CHECK-SD-DOT-NEXT: uabd v0.8b, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: add x11, x2, x8
; CHECK-SD-DOT-NEXT: add x10, x0, x9
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: add x11, x11, x8
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: add x10, x10, x9
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: add x10, x10, x9
; CHECK-SD-DOT-NEXT: add x11, x11, x8
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: add x10, x10, x9
; CHECK-SD-DOT-NEXT: add x11, x11, x8
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: add x10, x10, x9
; CHECK-SD-DOT-NEXT: add x11, x11, x8
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: add x10, x10, x9
; CHECK-SD-DOT-NEXT: add x11, x11, x8
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10]
; CHECK-SD-DOT-NEXT: ldr d4, [x11]
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: ldr d1, [x10, x9]
; CHECK-SD-DOT-NEXT: ldr d4, [x11, x8]
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-LABEL: full:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-GI-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-GI-NEXT: sxtw x9, w1
; CHECK-GI-NEXT: sxtw x8, w3
; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: ldr d1, [x2]
; CHECK-GI-NEXT: add x10, x0, x9
; CHECK-GI-NEXT: add x11, x2, x8
; CHECK-GI-NEXT: usubl v0.8h, v0.8b, v1.8b
; CHECK-GI-NEXT: ldr d1, [x10]
; CHECK-GI-NEXT: ldr d2, [x11]
; CHECK-GI-NEXT: add x10, x10, x9
; CHECK-GI-NEXT: add x11, x11, x8
; CHECK-GI-NEXT: usubl v1.8h, v1.8b, v2.8b
; CHECK-GI-NEXT: ldr d3, [x10]
; CHECK-GI-NEXT: ldr d4, [x11]
; CHECK-GI-NEXT: sshll v5.4s, v0.4h, #0
; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT: add x10, x10, x9
; CHECK-GI-NEXT: add x11, x11, x8
; CHECK-GI-NEXT: ldr d2, [x10]
; CHECK-GI-NEXT: add x10, x10, x9
; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0
; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT: ldr d6, [x11]
; CHECK-GI-NEXT: add x11, x11, x8
; CHECK-GI-NEXT: usubl v3.8h, v3.8b, v4.8b
; CHECK-GI-NEXT: abs v5.4s, v5.4s
; CHECK-GI-NEXT: abs v0.4s, v0.4s
; CHECK-GI-NEXT: ldr d4, [x10]
; CHECK-GI-NEXT: ldr d16, [x11]
; CHECK-GI-NEXT: abs v7.4s, v7.4s
; CHECK-GI-NEXT: abs v1.4s, v1.4s
; CHECK-GI-NEXT: add x10, x10, x9
; CHECK-GI-NEXT: add x11, x11, x8
; CHECK-GI-NEXT: usubl v2.8h, v2.8b, v6.8b
; CHECK-GI-NEXT: ldr d6, [x10]
; CHECK-GI-NEXT: ldr d17, [x11]
; CHECK-GI-NEXT: add x10, x10, x9
; CHECK-GI-NEXT: add x11, x11, x8
; CHECK-GI-NEXT: usubl v4.8h, v4.8b, v16.8b
; CHECK-GI-NEXT: sshll v16.4s, v3.4h, #0
; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0
; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s
; CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s
; CHECK-GI-NEXT: ldr d5, [x10]
; CHECK-GI-NEXT: ldr d7, [x11]
; CHECK-GI-NEXT: sshll v18.4s, v2.4h, #0
; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT: usubl v6.8h, v6.8b, v17.8b
; CHECK-GI-NEXT: ldr d17, [x11, x8]
; CHECK-GI-NEXT: sshll v19.4s, v4.4h, #0
; CHECK-GI-NEXT: usubl v5.8h, v5.8b, v7.8b
; CHECK-GI-NEXT: ldr d7, [x10, x9]
; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0
; CHECK-GI-NEXT: abs v16.4s, v16.4s
; CHECK-GI-NEXT: abs v3.4s, v3.4s
; CHECK-GI-NEXT: abs v18.4s, v18.4s
; CHECK-GI-NEXT: abs v2.4s, v2.4s
; CHECK-GI-NEXT: usubl v7.8h, v7.8b, v17.8b
; CHECK-GI-NEXT: sshll v17.4s, v6.4h, #0
; CHECK-GI-NEXT: sshll2 v6.4s, v6.8h, #0
; CHECK-GI-NEXT: abs v19.4s, v19.4s
; CHECK-GI-NEXT: abs v4.4s, v4.4s
; CHECK-GI-NEXT: add v3.4s, v16.4s, v3.4s
; CHECK-GI-NEXT: sshll v16.4s, v5.4h, #0
; CHECK-GI-NEXT: sshll2 v5.4s, v5.8h, #0
; CHECK-GI-NEXT: add v2.4s, v18.4s, v2.4s
; CHECK-GI-NEXT: abs v17.4s, v17.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: abs v6.4s, v6.4s
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: add v4.4s, v19.4s, v4.4s
; CHECK-GI-NEXT: addv s3, v3.4s
; CHECK-GI-NEXT: sshll v18.4s, v7.4h, #0
; CHECK-GI-NEXT: sshll2 v7.4s, v7.8h, #0
; CHECK-GI-NEXT: abs v16.4s, v16.4s
; CHECK-GI-NEXT: abs v5.4s, v5.4s
; CHECK-GI-NEXT: fmov w8, s1
; CHECK-GI-NEXT: add v6.4s, v17.4s, v6.4s
; CHECK-GI-NEXT: addv s2, v2.4s
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: addv s4, v4.4s
; CHECK-GI-NEXT: fmov w10, s3
; CHECK-GI-NEXT: abs v18.4s, v18.4s
; CHECK-GI-NEXT: abs v7.4s, v7.4s
; CHECK-GI-NEXT: add v1.4s, v16.4s, v5.4s
; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: addv s3, v6.4s
; CHECK-GI-NEXT: fmov w9, s2
; CHECK-GI-NEXT: add w8, w10, w8
; CHECK-GI-NEXT: fmov w10, s4
; CHECK-GI-NEXT: add v0.4s, v18.4s, v7.4s
; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: add w8, w9, w8
; CHECK-GI-NEXT: fmov w9, s3
; CHECK-GI-NEXT: add w8, w10, w8
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: add w8, w9, w8
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: add w8, w9, w8
; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add w0, w9, w8
; CHECK-GI-NEXT: ret
entry:
; Row strides, widened for pointer arithmetic.
%idx.ext8 = sext i32 %s2 to i64
%idx.ext = sext i32 %s1 to i64
; Row 0: read directly from %p1 / %p2.
%0 = load <8 x i8>, ptr %p1, align 1
%1 = zext <8 x i8> %0 to <8 x i32>
%2 = load <8 x i8>, ptr %p2, align 1
%3 = zext <8 x i8> %2 to <8 x i32>
%4 = sub nsw <8 x i32> %1, %3
%5 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %4, i1 true)
%6 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
; Row 1: advance both pointers by their stride, then repeat.
%add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext
%add.ptr9 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext8
%7 = load <8 x i8>, ptr %add.ptr, align 1
%8 = zext <8 x i8> %7 to <8 x i32>
%9 = load <8 x i8>, ptr %add.ptr9, align 1
%10 = zext <8 x i8> %9 to <8 x i32>
%11 = sub nsw <8 x i32> %8, %10
%12 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %11, i1 true)
%13 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
%op.rdx.1 = add i32 %13, %6
; Row 2.
%add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
%add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
%14 = load <8 x i8>, ptr %add.ptr.1, align 1
%15 = zext <8 x i8> %14 to <8 x i32>
%16 = load <8 x i8>, ptr %add.ptr9.1, align 1
%17 = zext <8 x i8> %16 to <8 x i32>
%18 = sub nsw <8 x i32> %15, %17
%19 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %18, i1 true)
%20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
%op.rdx.2 = add i32 %20, %op.rdx.1
; Row 3.
%add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
%add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
%21 = load <8 x i8>, ptr %add.ptr.2, align 1
%22 = zext <8 x i8> %21 to <8 x i32>
%23 = load <8 x i8>, ptr %add.ptr9.2, align 1
%24 = zext <8 x i8> %23 to <8 x i32>
%25 = sub nsw <8 x i32> %22, %24
%26 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %25, i1 true)
%27 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %26)
%op.rdx.3 = add i32 %27, %op.rdx.2
; Row 4.
%add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %idx.ext
%add.ptr9.3 = getelementptr inbounds i8, ptr %add.ptr9.2, i64 %idx.ext8
%28 = load <8 x i8>, ptr %add.ptr.3, align 1
%29 = zext <8 x i8> %28 to <8 x i32>
%30 = load <8 x i8>, ptr %add.ptr9.3, align 1
%31 = zext <8 x i8> %30 to <8 x i32>
%32 = sub nsw <8 x i32> %29, %31
%33 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %32, i1 true)
%34 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %33)
%op.rdx.4 = add i32 %34, %op.rdx.3
; Row 5.
%add.ptr.4 = getelementptr inbounds i8, ptr %add.ptr.3, i64 %idx.ext
%add.ptr9.4 = getelementptr inbounds i8, ptr %add.ptr9.3, i64 %idx.ext8
%35 = load <8 x i8>, ptr %add.ptr.4, align 1
%36 = zext <8 x i8> %35 to <8 x i32>
%37 = load <8 x i8>, ptr %add.ptr9.4, align 1
%38 = zext <8 x i8> %37 to <8 x i32>
%39 = sub nsw <8 x i32> %36, %38
%40 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %39, i1 true)
%41 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40)
%op.rdx.5 = add i32 %41, %op.rdx.4
; Row 6.
%add.ptr.5 = getelementptr inbounds i8, ptr %add.ptr.4, i64 %idx.ext
%add.ptr9.5 = getelementptr inbounds i8, ptr %add.ptr9.4, i64 %idx.ext8
%42 = load <8 x i8>, ptr %add.ptr.5, align 1
%43 = zext <8 x i8> %42 to <8 x i32>
%44 = load <8 x i8>, ptr %add.ptr9.5, align 1
%45 = zext <8 x i8> %44 to <8 x i32>
%46 = sub nsw <8 x i32> %43, %45
%47 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %46, i1 true)
%48 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
%op.rdx.6 = add i32 %48, %op.rdx.5
; Row 7 (last); its sum completes the returned total.
%add.ptr.6 = getelementptr inbounds i8, ptr %add.ptr.5, i64 %idx.ext
%add.ptr9.6 = getelementptr inbounds i8, ptr %add.ptr9.5, i64 %idx.ext8
%49 = load <8 x i8>, ptr %add.ptr.6, align 1
%50 = zext <8 x i8> %49 to <8 x i32>
%51 = load <8 x i8>, ptr %add.ptr9.6, align 1
%52 = zext <8 x i8> %51 to <8 x i32>
%53 = sub nsw <8 x i32> %50, %52
%54 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %53, i1 true)
%55 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %54)
%op.rdx.7 = add i32 %55, %op.rdx.6
ret i32 %op.rdx.7
}
; Splits <8 x i16> %a into its low and high <4 x i16> halves, zero-extends both
; to i32, adds them lane-wise and add-reduces the result, i.e. it sums all
; eight zero-extended elements of %a.
; Expected output: SelectionDAG folds the whole pattern into one uaddlv on the
; 8h input; GlobalISel emits ushll + uaddw2 followed by addv.
; The second shuffle operand is never selected by either mask, so it is written
; as poison, the recommended placeholder (undef is deprecated for this use).
define i32 @extract_hi_lo(<8 x i16> %a) {
; CHECK-SD-LABEL: extract_hi_lo:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: uaddlv s0, v0.8h
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: extract_hi_lo:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
%e1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%e2 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%z1 = zext <4 x i16> %e1 to <4 x i32>
%z2 = zext <4 x i16> %e2 to <4 x i32>
%z4 = add <4 x i32> %z1, %z2
%z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
ret i32 %z5
}
; Widening add-reduction where both addends are the same high half of %a:
; lanes 4-7 are extracted, zero-extended from i16 to i32, added to themselves
; and reduced to a scalar. The assertions below expect SelectionDAG to copy the
; high 64 bits over the low 64 bits and use uaddlv, while GlobalISel emits
; uaddl2 followed by addv.
; The second shuffle operand is never selected by the mask, so it is poison
; (undef is deprecated for this purpose).
define i32 @extract_hi_hi(<8 x i16> %a) {
; CHECK-SD-LABEL: extract_hi_hi:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    mov v0.d[0], v0.d[1]
; CHECK-SD-NEXT:    uaddlv s0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: extract_hi_hi:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddl2 v0.4s, v0.8h, v0.8h
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %e2 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %z2 = zext <4 x i16> %e2 to <4 x i32>
  %z4 = add <4 x i32> %z2, %z2
  %z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
  ret i32 %z5
}
; Widening add-reduction where both addends are the same low half of %a:
; lanes 0-3 are extracted, zero-extended from i16 to i32, added to themselves
; and reduced to a scalar. The assertions below expect SelectionDAG to copy the
; low 64 bits over the high 64 bits and use uaddlv, while GlobalISel emits
; uaddl followed by addv.
; The second shuffle operand is never selected by the mask, so it is poison
; (undef is deprecated for this purpose).
define i32 @extract_lo_lo(<8 x i16> %a) {
; CHECK-SD-LABEL: extract_lo_lo:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    mov v0.d[1], v0.d[0]
; CHECK-SD-NEXT:    uaddlv s0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: extract_lo_lo:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddl v0.4s, v0.4h, v0.4h
; CHECK-GI-NEXT:    addv s0, v0.4s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %e1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %z1 = zext <4 x i16> %e1 to <4 x i32>
  %z4 = add <4 x i32> %z1, %z1
  %z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
  ret i32 %z5
}
; Intrinsic declarations, grouped by element type and listed in order of
; increasing lane count within each group.

; Lane-wise absolute value.
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1

; Add reductions producing i8.
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)

; Add reductions producing i16.
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v24i16(<24 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)

; Add reductions producing i32.
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)

; Add reductions producing i64.
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)