llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll
Sander de Smalen 61510b51c3 Revert "[AArch64] Enable subreg liveness tracking by default."
This reverts commit 9c319d5bb40785c969d2af76535ca62448dfafa7.

Some issues were discovered with the bootstrap builds, which
seem like they were caused by this commit. I'm reverting to investigate.
2024-12-12 17:22:15 +00:00

1628 lines
52 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
;
; ADD
;
; Don't use SVE for 64-bit vectors.
; <8 x i8> is only 64 bits wide; the expected code is a single NEON add on
; v0/v1 (.8b arrangement) with no SVE z-register or predicate involved.
define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%res = add <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
; <16 x i8> is 128 bits wide; the expected code is a single NEON add on
; v0/v1 (.16b arrangement).
define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%res = add <16 x i8> %op1, %op2
ret <16 x i8> %res
}
; 256-bit vector (32 x i8) with vscale_range(2,0), i.e. a minimum vscale of 2.
; The expected code loads both operands, performs one SVE add and stores the
; result back to %a, all under a "ptrue p0.b, vl32" predicate.
define void @add_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: add z0.b, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
%res = add <32 x i8> %op1, %op2
store <32 x i8> %res, ptr %a
ret void
}
; 512-bit vector (64 x i8) with no vscale_range attribute, so the expected
; code depends on the -aarch64-sve-vector-bits-min value passed to llc:
;  * 256: the add is split into two vl32 halves; the upper half is addressed
;    through x8 = 32 (byte offset).
;  * 512 or 2048: a single vl64 operation covers the whole vector.
define void @add_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: add z0.b, z0.b, z1.b
; VBITS_GE_256-NEXT: add z1.b, z2.b, z3.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: add_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: add z0.b, z0.b, z1.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, ptr %a
%op2 = load <64 x i8>, ptr %b
%res = add <64 x i8> %op1, %op2
store <64 x i8> %res, ptr %a
ret void
}
; 1024-bit vector (128 x i8) with vscale_range(8,0). The expected code is a
; single SVE add under a "ptrue p0.b, vl128" predicate, with the result
; stored back to %a.
define void @add_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: add z0.b, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, ptr %a
%op2 = load <128 x i8>, ptr %b
%res = add <128 x i8> %op1, %op2
store <128 x i8> %res, ptr %a
ret void
}
; 2048-bit vector (256 x i8) with vscale_range(16,0). The expected code is a
; single SVE add under a "ptrue p0.b, vl256" predicate, with the result
; stored back to %a.
define void @add_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: add z0.b, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, ptr %a
%op2 = load <256 x i8>, ptr %b
%res = add <256 x i8> %op1, %op2
store <256 x i8> %res, ptr %a
ret void
}
; Don't use SVE for 64-bit vectors.
; <4 x i16> is 64 bits wide; the expected code is a single NEON add on
; v0/v1 (.4h arrangement).
define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
%res = add <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
; <8 x i16> is 128 bits wide; the expected code is a single NEON add on
; v0/v1 (.8h arrangement).
define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%res = add <8 x i16> %op1, %op2
ret <8 x i16> %res
}
; 256-bit vector (16 x i16) with vscale_range(2,0). The expected code is a
; single SVE add under a "ptrue p0.h, vl16" predicate, with the result
; stored back to %a.
define void @add_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b
%res = add <16 x i16> %op1, %op2
store <16 x i16> %res, ptr %a
ret void
}
; 512-bit vector (32 x i16) with no vscale_range attribute, so the expected
; code depends on the -aarch64-sve-vector-bits-min value passed to llc:
;  * 256: the add is split into two vl16 halves; the upper half is addressed
;    through x8 = 16 elements, scaled by "lsl #1".
;  * 512 or 2048: a single vl32 operation covers the whole vector.
define void @add_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: add z0.h, z0.h, z1.h
; VBITS_GE_256-NEXT: add z1.h, z2.h, z3.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: add_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: add z0.h, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, ptr %a
%op2 = load <32 x i16>, ptr %b
%res = add <32 x i16> %op1, %op2
store <32 x i16> %res, ptr %a
ret void
}
; 1024-bit vector (64 x i16) with vscale_range(8,0). The expected code is a
; single SVE add under a "ptrue p0.h, vl64" predicate, with the result
; stored back to %a.
define void @add_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i16>, ptr %a
%op2 = load <64 x i16>, ptr %b
%res = add <64 x i16> %op1, %op2
store <64 x i16> %res, ptr %a
ret void
}
; 2048-bit vector (128 x i16) with vscale_range(16,0). The expected code is a
; single SVE add under a "ptrue p0.h, vl128" predicate, with the result
; stored back to %a.
define void @add_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, ptr %a
%op2 = load <128 x i16>, ptr %b
%res = add <128 x i16> %op1, %op2
store <128 x i16> %res, ptr %a
ret void
}
; Don't use SVE for 64-bit vectors.
; <2 x i32> is 64 bits wide; the expected code is a single NEON add on
; v0/v1 (.2s arrangement).
define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
%res = add <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
; <4 x i32> is 128 bits wide; the expected code is a single NEON add on
; v0/v1 (.4s arrangement).
define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%res = add <4 x i32> %op1, %op2
ret <4 x i32> %res
}
; 256-bit vector (8 x i32) with vscale_range(2,0). The expected code is a
; single SVE add under a "ptrue p0.s, vl8" predicate, with the result
; stored back to %a.
define void @add_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b
%res = add <8 x i32> %op1, %op2
store <8 x i32> %res, ptr %a
ret void
}
; 512-bit vector (16 x i32) with no vscale_range attribute, so the expected
; code depends on the -aarch64-sve-vector-bits-min value passed to llc:
;  * 256: the add is split into two vl8 halves; the upper half is addressed
;    through x8 = 8 elements, scaled by "lsl #2".
;  * 512 or 2048: a single vl16 operation covers the whole vector.
define void @add_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: add z0.s, z0.s, z1.s
; VBITS_GE_256-NEXT: add z1.s, z2.s, z3.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: add_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: add z0.s, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, ptr %a
%op2 = load <16 x i32>, ptr %b
%res = add <16 x i32> %op1, %op2
store <16 x i32> %res, ptr %a
ret void
}
; 1024-bit vector (32 x i32) with vscale_range(8,0). The expected code is a
; single SVE add under a "ptrue p0.s, vl32" predicate, with the result
; stored back to %a.
define void @add_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i32>, ptr %a
%op2 = load <32 x i32>, ptr %b
%res = add <32 x i32> %op1, %op2
store <32 x i32> %res, ptr %a
ret void
}
; 2048-bit vector (64 x i32) with vscale_range(16,0). The expected code is a
; single SVE add under a "ptrue p0.s, vl64" predicate, with the result
; stored back to %a.
define void @add_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i32>, ptr %a
%op2 = load <64 x i32>, ptr %b
%res = add <64 x i32> %op1, %op2
store <64 x i32> %res, ptr %a
ret void
}
; Don't use SVE for 64-bit vectors.
; <1 x i64> holds a single 64-bit element; the expected code is a scalar add
; on the d0/d1 registers.
define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: add d0, d0, d1
; CHECK-NEXT: ret
%res = add <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Don't use SVE for 128-bit vectors.
; <2 x i64> is 128 bits wide; the expected code is a single NEON add on
; v0/v1 (.2d arrangement).
define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
%res = add <2 x i64> %op1, %op2
ret <2 x i64> %res
}
; 256-bit vector (4 x i64) with vscale_range(2,0). The expected code is a
; single SVE add under a "ptrue p0.d, vl4" predicate, with the result
; stored back to %a.
define void @add_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: add z0.d, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b
%res = add <4 x i64> %op1, %op2
store <4 x i64> %res, ptr %a
ret void
}
; 512-bit vector (8 x i64) with no vscale_range attribute, so the expected
; code depends on the -aarch64-sve-vector-bits-min value passed to llc:
;  * 256: the add is split into two vl4 halves; the upper half is addressed
;    through x8 = 4 elements, scaled by "lsl #3".
;  * 512 or 2048: a single vl8 operation covers the whole vector.
define void @add_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: add z0.d, z0.d, z1.d
; VBITS_GE_256-NEXT: add z1.d, z2.d, z3.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: add_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: add z0.d, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, ptr %a
%op2 = load <8 x i64>, ptr %b
%res = add <8 x i64> %op1, %op2
store <8 x i64> %res, ptr %a
ret void
}
; 1024-bit vector (16 x i64) with vscale_range(8,0). The expected code is a
; single SVE add under a "ptrue p0.d, vl16" predicate, with the result
; stored back to %a.
define void @add_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: add z0.d, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i64>, ptr %a
%op2 = load <16 x i64>, ptr %b
%res = add <16 x i64> %op1, %op2
store <16 x i64> %res, ptr %a
ret void
}
; 2048-bit vector (32 x i64). Like the other 2048-bit cases in this file, use
; vscale_range(16,0) so the whole vector is handled by one SVE operation; the
; expected code is a single add under a "ptrue p0.d, vl32" predicate, with
; the result stored back to %a.
; NOTE(review): the expected assembly below follows the sibling 2048-bit
; tests; regenerate it with utils/update_llc_test_checks.py to confirm.
define void @add_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: add z0.d, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i64>, ptr %a
%op2 = load <32 x i64>, ptr %b
%res = add <32 x i64> %op1, %op2
store <32 x i64> %res, ptr %a
ret void
}
;
; MUL
;
; Don't use SVE for 64-bit vectors.
; <8 x i8> is 64 bits wide; the expected code is a single NEON mul on
; v0/v1 (.8b arrangement).
define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mul v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%res = mul <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
; <16 x i8> is 128 bits wide; the expected code is a single NEON mul on
; v0/v1 (.16b arrangement).
define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%res = mul <16 x i8> %op1, %op2
ret <16 x i8> %res
}
; 256-bit vector (32 x i8) with vscale_range(2,0). The expected code is one
; predicated SVE mul (p0/m, destination z0 is also the first source) under a
; "ptrue p0.b, vl32" predicate, with the result stored back to %a.
define void @mul_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
%res = mul <32 x i8> %op1, %op2
store <32 x i8> %res, ptr %a
ret void
}
; 512-bit vector (64 x i8) with no vscale_range attribute, so the expected
; code depends on the -aarch64-sve-vector-bits-min value passed to llc:
;  * 256: the mul is split into two vl32 halves (upper half at byte offset
;    x8 = 32). For the lower half, "movprfx z1, z2" copies the first operand
;    into z1 before the predicated mul overwrites z1.
;  * 512 or 2048: a single vl64 predicated mul covers the whole vector.
define void @mul_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: mul z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: mul z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: mul_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: mul z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, ptr %a
%op2 = load <64 x i8>, ptr %b
%res = mul <64 x i8> %op1, %op2
store <64 x i8> %res, ptr %a
ret void
}
; 1024-bit vector (128 x i8) with vscale_range(8,0). The expected code is one
; predicated SVE mul (p0/m) under a "ptrue p0.b, vl128" predicate, with the
; result stored back to %a.
define void @mul_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, ptr %a
%op2 = load <128 x i8>, ptr %b
%res = mul <128 x i8> %op1, %op2
store <128 x i8> %res, ptr %a
ret void
}
; 2048-bit vector (256 x i8) with vscale_range(16,0). The expected code is
; one predicated SVE mul (p0/m) under a "ptrue p0.b, vl256" predicate, with
; the result stored back to %a.
define void @mul_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, ptr %a
%op2 = load <256 x i8>, ptr %b
%res = mul <256 x i8> %op1, %op2
store <256 x i8> %res, ptr %a
ret void
}
; Don't use SVE for 64-bit vectors.
; <4 x i16> is 64 bits wide; the expected code is a single NEON mul on
; v0/v1 (.4h arrangement).
define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
%res = mul <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
; <8 x i16> is 128 bits wide; the expected code is a single NEON mul on
; v0/v1 (.8h arrangement).
define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%res = mul <8 x i16> %op1, %op2
ret <8 x i16> %res
}
; 256-bit vector (16 x i16) with vscale_range(2,0). The expected code is one
; predicated SVE mul (p0/m) under a "ptrue p0.h, vl16" predicate, with the
; result stored back to %a.
define void @mul_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b
%res = mul <16 x i16> %op1, %op2
store <16 x i16> %res, ptr %a
ret void
}
; 512-bit vector (32 x i16) with no vscale_range attribute, so the expected
; code depends on the -aarch64-sve-vector-bits-min value passed to llc:
;  * 256: the mul is split into two vl16 halves (upper half at x8 = 16
;    elements, scaled by "lsl #1"). For the lower half, "movprfx z1, z2"
;    copies the first operand into z1 before the predicated mul overwrites z1.
;  * 512 or 2048: a single vl32 predicated mul covers the whole vector.
define void @mul_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: mul z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: mul_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: mul z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, ptr %a
%op2 = load <32 x i16>, ptr %b
%res = mul <32 x i16> %op1, %op2
store <32 x i16> %res, ptr %a
ret void
}
; 1024-bit vector (64 x i16) with vscale_range(8,0). The expected code is one
; predicated SVE mul (p0/m) under a "ptrue p0.h, vl64" predicate, with the
; result stored back to %a.
define void @mul_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i16>, ptr %a
%op2 = load <64 x i16>, ptr %b
%res = mul <64 x i16> %op1, %op2
store <64 x i16> %res, ptr %a
ret void
}
; 2048-bit vector (128 x i16) with vscale_range(16,0). The expected code is
; one predicated SVE mul (p0/m) under a "ptrue p0.h, vl128" predicate, with
; the result stored back to %a.
define void @mul_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, ptr %a
%op2 = load <128 x i16>, ptr %b
%res = mul <128 x i16> %op1, %op2
store <128 x i16> %res, ptr %a
ret void
}
; Don't use SVE for 64-bit vectors.
; <2 x i32> is 64 bits wide; the expected code is a single NEON mul on
; v0/v1 (.2s arrangement).
define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
%res = mul <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
; <4 x i32> is 128 bits wide; the expected code is a single NEON mul on
; v0/v1 (.4s arrangement).
define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%res = mul <4 x i32> %op1, %op2
ret <4 x i32> %res
}
; 256-bit vector (8 x i32) with vscale_range(2,0). The expected code is one
; predicated SVE mul (p0/m) under a "ptrue p0.s, vl8" predicate, with the
; result stored back to %a.
define void @mul_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b
%res = mul <8 x i32> %op1, %op2
store <8 x i32> %res, ptr %a
ret void
}
; 512-bit vector (16 x i32) with no vscale_range attribute, so the expected
; code depends on the -aarch64-sve-vector-bits-min value passed to llc:
;  * 256: the mul is split into two vl8 halves (upper half at x8 = 8
;    elements, scaled by "lsl #2"). For the lower half, "movprfx z1, z2"
;    copies the first operand into z1 before the predicated mul overwrites z1.
;  * 512 or 2048: a single vl16 predicated mul covers the whole vector.
define void @mul_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: mul z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: mul_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: mul z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, ptr %a
%op2 = load <16 x i32>, ptr %b
%res = mul <16 x i32> %op1, %op2
store <16 x i32> %res, ptr %a
ret void
}
; 1024-bit vector (32 x i32) with vscale_range(8,0). The expected code is one
; predicated SVE mul (p0/m) under a "ptrue p0.s, vl32" predicate, with the
; result stored back to %a.
define void @mul_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i32>, ptr %a
%op2 = load <32 x i32>, ptr %b
%res = mul <32 x i32> %op1, %op2
store <32 x i32> %res, ptr %a
ret void
}
; 2048-bit vector (64 x i32) with vscale_range(16,0). The expected code is
; one predicated SVE mul (p0/m) under a "ptrue p0.s, vl64" predicate, with
; the result stored back to %a.
define void @mul_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i32>, ptr %a
%op2 = load <64 x i32>, ptr %b
%res = mul <64 x i32> %op1, %op2
store <64 x i32> %res, ptr %a
ret void
}
; Unlike the narrower element types, this 64-bit vector (1 x i64) IS expected
; to use SVE: the multiply is a predicated SVE mul on z0/z1 under a
; "ptrue p0.d, vl1" predicate (presumably because NEON has no vector multiply
; for 64-bit elements - confirm against the Arm ARM). The "// kill"
; annotations in the expected output record d0/d1 being treated as z0/z1.
define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: mul_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = mul <1 x i64> %op1, %op2
ret <1 x i64> %res
}
; Unlike the narrower element types, this 128-bit vector (2 x i64) IS
; expected to use SVE: the multiply is a predicated SVE mul on z0/z1 under a
; "ptrue p0.d, vl2" predicate (presumably because NEON has no vector multiply
; for 64-bit elements - confirm against the Arm ARM). The "// kill"
; annotations in the expected output record q0/q1 being treated as z0/z1.
define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: mul_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = mul <2 x i64> %op1, %op2
ret <2 x i64> %res
}
; 256-bit vector (4 x i64) with vscale_range(2,0). The expected code is one
; predicated SVE mul (p0/m) under a "ptrue p0.d, vl4" predicate, with the
; result stored back to %a.
define void @mul_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b
%res = mul <4 x i64> %op1, %op2
store <4 x i64> %res, ptr %a
ret void
}
; 512-bit vector (8 x i64) with no vscale_range attribute, so the expected
; code depends on the -aarch64-sve-vector-bits-min value passed to llc:
;  * 256: the mul is split into two vl4 halves (upper half at x8 = 4
;    elements, scaled by "lsl #3"). For the lower half, "movprfx z1, z2"
;    copies the first operand into z1 before the predicated mul overwrites z1.
;  * 512 or 2048: a single vl8 predicated mul covers the whole vector.
define void @mul_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: mul z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT: movprfx z1, z2
; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: mul_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: mul z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <8 x i64>, ptr %a
%op2 = load <8 x i64>, ptr %b
%res = mul <8 x i64> %op1, %op2
store <8 x i64> %res, ptr %a
ret void
}
; 1024-bit vector (16 x i64) with vscale_range(8,0). The expected code is one
; predicated SVE mul (p0/m) under a "ptrue p0.d, vl16" predicate, with the
; result stored back to %a.
define void @mul_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i64>, ptr %a
%op2 = load <16 x i64>, ptr %b
%res = mul <16 x i64> %op1, %op2
store <16 x i64> %res, ptr %a
ret void
}
; 2048-bit vector (32 x i64) with vscale_range(16,0). The expected code is
; one predicated SVE mul (p0/m) under a "ptrue p0.d, vl32" predicate, with
; the result stored back to %a.
define void @mul_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i64>, ptr %a
%op2 = load <32 x i64>, ptr %b
%res = mul <32 x i64> %op1, %op2
store <32 x i64> %res, ptr %a
ret void
}
;
; SUB
;
; Don't use SVE for 64-bit vectors.
; <8 x i8> is 64 bits wide; the expected code is a single NEON sub on
; v0/v1 (.8b arrangement).
define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%res = sub <8 x i8> %op1, %op2
ret <8 x i8> %res
}
; Don't use SVE for 128-bit vectors.
; <16 x i8> is 128 bits wide; the expected code is a single NEON sub on
; v0/v1 (.16b arrangement).
define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%res = sub <16 x i8> %op1, %op2
ret <16 x i8> %res
}
; 256-bit vector (32 x i8) with vscale_range(2,0). The expected code is a
; single SVE sub under a "ptrue p0.b, vl32" predicate, with the result
; stored back to %a.
define void @sub_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: sub z0.b, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b
%res = sub <32 x i8> %op1, %op2
store <32 x i8> %res, ptr %a
ret void
}
; 512-bit vector (64 x i8) with no vscale_range attribute, so the expected
; code depends on the -aarch64-sve-vector-bits-min value passed to llc:
;  * 256: the sub is split into two vl32 halves; the upper half is addressed
;    through x8 = 32 (byte offset).
;  * 512 or 2048: a single vl64 operation covers the whole vector.
define void @sub_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: sub z0.b, z0.b, z1.b
; VBITS_GE_256-NEXT: sub z1.b, z2.b, z3.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sub_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: sub z0.b, z0.b, z1.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <64 x i8>, ptr %a
%op2 = load <64 x i8>, ptr %b
%res = sub <64 x i8> %op1, %op2
store <64 x i8> %res, ptr %a
ret void
}
; 1024-bit vector (128 x i8) with vscale_range(8,0). The expected code is a
; single SVE sub under a "ptrue p0.b, vl128" predicate, with the result
; stored back to %a.
define void @sub_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: sub z0.b, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i8>, ptr %a
%op2 = load <128 x i8>, ptr %b
%res = sub <128 x i8> %op1, %op2
store <128 x i8> %res, ptr %a
ret void
}
; 2048-bit vector (256 x i8) with vscale_range(16,0). The expected code is a
; single SVE sub under a "ptrue p0.b, vl256" predicate, with the result
; stored back to %a.
define void @sub_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: sub z0.b, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <256 x i8>, ptr %a
%op2 = load <256 x i8>, ptr %b
%res = sub <256 x i8> %op1, %op2
store <256 x i8> %res, ptr %a
ret void
}
; Don't use SVE for 64-bit vectors.
; <4 x i16> is 64 bits wide; the expected code is a single NEON sub on
; v0/v1 (.4h arrangement).
define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
%res = sub <4 x i16> %op1, %op2
ret <4 x i16> %res
}
; Don't use SVE for 128-bit vectors.
; <8 x i16> is 128 bits wide; the expected code is a single NEON sub on
; v0/v1 (.8h arrangement).
define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
%res = sub <8 x i16> %op1, %op2
ret <8 x i16> %res
}
; 256-bit vector (16 x i16) with vscale_range(2,0). The expected code is a
; single SVE sub under a "ptrue p0.h, vl16" predicate, with the result
; stored back to %a.
define void @sub_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: sub z0.h, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b
%res = sub <16 x i16> %op1, %op2
store <16 x i16> %res, ptr %a
ret void
}
; 512-bit vector (32 x i16) with no vscale_range attribute, so the expected
; code depends on the -aarch64-sve-vector-bits-min value passed to llc:
;  * 256: the sub is split into two vl16 halves; the upper half is addressed
;    through x8 = 16 elements, scaled by "lsl #1".
;  * 512 or 2048: a single vl32 operation covers the whole vector.
define void @sub_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: sub z0.h, z0.h, z1.h
; VBITS_GE_256-NEXT: sub z1.h, z2.h, z3.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sub_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT: sub z0.h, z0.h, z1.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <32 x i16>, ptr %a
%op2 = load <32 x i16>, ptr %b
%res = sub <32 x i16> %op1, %op2
store <32 x i16> %res, ptr %a
ret void
}
; 1024-bit vector (64 x i16) with vscale_range(8,0). The expected code is a
; single SVE sub under a "ptrue p0.h, vl64" predicate, with the result
; stored back to %a.
define void @sub_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: sub z0.h, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i16>, ptr %a
%op2 = load <64 x i16>, ptr %b
%res = sub <64 x i16> %op1, %op2
store <64 x i16> %res, ptr %a
ret void
}
; 2048-bit vector (128 x i16) with vscale_range(16,0). The expected code is a
; single SVE sub under a "ptrue p0.h, vl128" predicate, with the result
; stored back to %a.
define void @sub_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: sub z0.h, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <128 x i16>, ptr %a
%op2 = load <128 x i16>, ptr %b
%res = sub <128 x i16> %op1, %op2
store <128 x i16> %res, ptr %a
ret void
}
; Don't use SVE for 64-bit vectors.
; <2 x i32> is 64 bits wide; the expected code is a single NEON sub on
; v0/v1 (.2s arrangement).
define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
%res = sub <2 x i32> %op1, %op2
ret <2 x i32> %res
}
; Don't use SVE for 128-bit vectors.
; <4 x i32> is 128 bits wide; the expected code is a single NEON sub on
; v0/v1 (.4s arrangement).
define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%res = sub <4 x i32> %op1, %op2
ret <4 x i32> %res
}
; 256-bit vector (8 x i32) with vscale_range(2,0). The expected code is a
; single SVE sub under a "ptrue p0.s, vl8" predicate, with the result
; stored back to %a.
define void @sub_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sub z0.s, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b
%res = sub <8 x i32> %op1, %op2
store <8 x i32> %res, ptr %a
ret void
}
; 512-bit vector (16 x i32) with no vscale_range attribute, so the expected
; code depends on the -aarch64-sve-vector-bits-min value passed to llc:
;  * 256: the sub is split into two vl8 halves; the upper half is addressed
;    through x8 = 8 elements, scaled by "lsl #2".
;  * 512 or 2048: a single vl16 operation covers the whole vector.
define void @sub_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: sub z0.s, z0.s, z1.s
; VBITS_GE_256-NEXT: sub z1.s, z2.s, z3.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sub_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT: sub z0.s, z0.s, z1.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
%op1 = load <16 x i32>, ptr %a
%op2 = load <16 x i32>, ptr %b
%res = sub <16 x i32> %op1, %op2
store <16 x i32> %res, ptr %a
ret void
}
; 1024-bit case: <32 x i32> subtraction. vscale_range(8,0) gives a minimum
; SVE register size of 1024 bits, so a single vl32 word-predicated
; load/sub/store sequence is expected.
define void @sub_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sub z0.s, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %lhs = load <32 x i32>, ptr %a
  %rhs = load <32 x i32>, ptr %b
  %diff = sub <32 x i32> %lhs, %rhs
  ; The result overwrites the first operand's memory.
  store <32 x i32> %diff, ptr %a
  ret void
}
; 2048-bit case: <64 x i32> subtraction. vscale_range(16,0) gives a minimum
; SVE register size of 2048 bits, so a single vl64 word-predicated
; load/sub/store sequence is expected.
define void @sub_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: sub z0.s, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %lhs = load <64 x i32>, ptr %a
  %rhs = load <64 x i32>, ptr %b
  %diff = sub <64 x i32> %lhs, %rhs
  ; The result overwrites the first operand's memory.
  store <64 x i32> %diff, ptr %a
  ret void
}
; 64-bit vectors must not use SVE: the single-element i64 vector is expected
; to become one `sub` on the d registers.
define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: sub d0, d0, d1
; CHECK-NEXT: ret
  %diff = sub <1 x i64> %op1, %op2
  ret <1 x i64> %diff
}
; 128-bit vectors must stay on NEON: the expected code is one `sub` on the
; 2d arrangement, with no SVE predicate or z-register involved.
define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
  %diff = sub <2 x i64> %op1, %op2
  ret <2 x i64> %diff
}
; 256-bit case: <4 x i64> subtraction. vscale_range(2,0) gives a minimum SVE
; register size of 256 bits, so one vl4 doubleword-predicated load/sub/store
; sequence covers the whole vector.
define void @sub_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sub z0.d, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %lhs = load <4 x i64>, ptr %a
  %rhs = load <4 x i64>, ptr %b
  %diff = sub <4 x i64> %lhs, %rhs
  ; The result overwrites the first operand's memory.
  store <4 x i64> %diff, ptr %a
  ret void
}
; 512-bit case: <8 x i64> subtraction. There is no vscale_range attribute
; here, so the expected code depends on the RUN line's
; -aarch64-sve-vector-bits-min value: with 256-bit registers the vector is
; processed as two vl4 halves (the upper half addressed through x8), while
; 512 bits or more allow a single vl8 operation.
define void @sub_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: sub z0.d, z0.d, z1.d
; VBITS_GE_256-NEXT: sub z1.d, z2.d, z3.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sub_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: sub z0.d, z0.d, z1.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %lhs = load <8 x i64>, ptr %a
  %rhs = load <8 x i64>, ptr %b
  %diff = sub <8 x i64> %lhs, %rhs
  ; The result overwrites the first operand's memory.
  store <8 x i64> %diff, ptr %a
  ret void
}
; 1024-bit case: <16 x i64> subtraction. vscale_range(8,0) gives a minimum
; SVE register size of 1024 bits, so a single vl16 doubleword-predicated
; load/sub/store sequence is expected.
define void @sub_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sub z0.d, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %lhs = load <16 x i64>, ptr %a
  %rhs = load <16 x i64>, ptr %b
  %diff = sub <16 x i64> %lhs, %rhs
  ; The result overwrites the first operand's memory.
  store <16 x i64> %diff, ptr %a
  ret void
}
; 2048-bit case: <32 x i64> subtraction. vscale_range(16,0) gives a minimum
; SVE register size of 2048 bits, so a single vl32 doubleword-predicated
; load/sub/store sequence is expected.
define void @sub_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: sub z0.d, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %lhs = load <32 x i64>, ptr %a
  %rhs = load <32 x i64>, ptr %b
  %diff = sub <32 x i64> %lhs, %rhs
  ; The result overwrites the first operand's memory.
  store <32 x i64> %diff, ptr %a
  ret void
}
;
; ABS
;
; 64-bit vectors must stay on NEON: the expected code is one `abs` on the
; 8b arrangement, with no SVE predicate or z-register involved.
define <8 x i8> @abs_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.8b, v0.8b
; CHECK-NEXT: ret
  %mag = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
  ret <8 x i8> %mag
}
; 128-bit vectors must stay on NEON: the expected code is one `abs` on the
; 16b arrangement, with no SVE predicate or z-register involved.
define <16 x i8> @abs_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.16b, v0.16b
; CHECK-NEXT: ret
  %mag = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
  ret <16 x i8> %mag
}
; 256-bit case: in-place absolute value of <32 x i8>. vscale_range(2,0) gives
; a minimum SVE register size of 256 bits, so one vl32 byte-predicated
; load/abs/store sequence is expected.
define void @abs_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: abs z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <32 x i8>, ptr %a
  %mag = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %src, i1 false)
  ; The result is written back over the input.
  store <32 x i8> %mag, ptr %a
  ret void
}
; 512-bit case: in-place absolute value of <64 x i8>. There is no
; vscale_range attribute here, so the expected code depends on the RUN
; line's -aarch64-sve-vector-bits-min value: with 256-bit registers the
; vector is processed as two vl32 halves (the upper half addressed through
; x8), while 512 bits or more allow a single vl64 operation.
define void @abs_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: abs z0.b, p0/m, z0.b
; VBITS_GE_256-NEXT: abs z1.b, p0/m, z1.b
; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: abs_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: abs z0.b, p0/m, z0.b
; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %src = load <64 x i8>, ptr %a
  %mag = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %src, i1 false)
  ; The result is written back over the input.
  store <64 x i8> %mag, ptr %a
  ret void
}
; 1024-bit case: in-place absolute value of <128 x i8>. vscale_range(8,0)
; gives a minimum SVE register size of 1024 bits, so one vl128
; byte-predicated load/abs/store sequence is expected.
define void @abs_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: abs z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <128 x i8>, ptr %a
  %mag = call <128 x i8> @llvm.abs.v128i8(<128 x i8> %src, i1 false)
  ; The result is written back over the input.
  store <128 x i8> %mag, ptr %a
  ret void
}
; 2048-bit case: in-place absolute value of <256 x i8>. vscale_range(16,0)
; gives a minimum SVE register size of 2048 bits, so one vl256
; byte-predicated load/abs/store sequence is expected.
define void @abs_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: abs z0.b, p0/m, z0.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <256 x i8>, ptr %a
  %mag = call <256 x i8> @llvm.abs.v256i8(<256 x i8> %src, i1 false)
  ; The result is written back over the input.
  store <256 x i8> %mag, ptr %a
  ret void
}
; 64-bit vectors must stay on NEON: the expected code is one `abs` on the
; 4h arrangement, with no SVE predicate or z-register involved.
define <4 x i16> @abs_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.4h, v0.4h
; CHECK-NEXT: ret
  %mag = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
  ret <4 x i16> %mag
}
; 128-bit vectors must stay on NEON: the expected code is one `abs` on the
; 8h arrangement, with no SVE predicate or z-register involved.
define <8 x i16> @abs_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.8h, v0.8h
; CHECK-NEXT: ret
  %mag = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
  ret <8 x i16> %mag
}
; 256-bit case: in-place absolute value of <16 x i16>. vscale_range(2,0)
; gives a minimum SVE register size of 256 bits, so one vl16
; halfword-predicated load/abs/store sequence is expected.
define void @abs_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: abs z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <16 x i16>, ptr %a
  %mag = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %src, i1 false)
  ; The result is written back over the input.
  store <16 x i16> %mag, ptr %a
  ret void
}
; 512-bit case: in-place absolute value of <32 x i16>. Like the other 512-bit
; tests in this file, this function carries no vscale_range attribute, so the
; expected code depends on the RUN line's -aarch64-sve-vector-bits-min value:
; with 256-bit registers the vector is processed as two vl16 halves (the
; upper half addressed through x8), while 512 bits or more allow a single
; vl32 operation.
; NOTE(review): the expectations below were derived by analogy with
; abs_v16i32 / abs_v64i8; regenerate with utils/update_llc_test_checks.py
; to confirm.
define void @abs_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: abs z0.h, p0/m, z0.h
; VBITS_GE_256-NEXT: abs z1.h, p0/m, z1.h
; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: abs_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: abs z0.h, p0/m, z0.h
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %src = load <32 x i16>, ptr %a
  %mag = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %src, i1 false)
  ; The result is written back over the input.
  store <32 x i16> %mag, ptr %a
  ret void
}
; 1024-bit case: in-place absolute value of <64 x i16>. vscale_range(8,0)
; gives a minimum SVE register size of 1024 bits, matching the other
; 1024-bit tests in this file, so one vl64 halfword-predicated
; load/abs/store sequence is expected.
; NOTE(review): the expectations below were derived by analogy with
; abs_v128i8 / abs_v32i32; regenerate with utils/update_llc_test_checks.py
; to confirm.
define void @abs_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: abs z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <64 x i16>, ptr %a
  %mag = call <64 x i16> @llvm.abs.v64i16(<64 x i16> %src, i1 false)
  ; The result is written back over the input.
  store <64 x i16> %mag, ptr %a
  ret void
}
; 2048-bit case: in-place absolute value of <128 x i16>. vscale_range(16,0)
; gives a minimum SVE register size of 2048 bits, matching the other
; 2048-bit tests in this file, so one vl128 halfword-predicated
; load/abs/store sequence is expected.
; NOTE(review): the expectations below were derived by analogy with
; abs_v256i8 / abs_v64i32; regenerate with utils/update_llc_test_checks.py
; to confirm.
define void @abs_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: abs z0.h, p0/m, z0.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <128 x i16>, ptr %a
  %mag = call <128 x i16> @llvm.abs.v128i16(<128 x i16> %src, i1 false)
  ; The result is written back over the input.
  store <128 x i16> %mag, ptr %a
  ret void
}
; 64-bit vectors must stay on NEON: the expected code is one `abs` on the
; 2s arrangement, with no SVE predicate or z-register involved.
define <2 x i32> @abs_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.2s, v0.2s
; CHECK-NEXT: ret
  %mag = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
  ret <2 x i32> %mag
}
; 128-bit vectors must stay on NEON: the expected code is one `abs` on the
; 4s arrangement, with no SVE predicate or z-register involved.
define <4 x i32> @abs_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.4s, v0.4s
; CHECK-NEXT: ret
  %mag = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
  ret <4 x i32> %mag
}
; 256-bit case: in-place absolute value of <8 x i32>. vscale_range(2,0)
; gives a minimum SVE register size of 256 bits, so one vl8 word-predicated
; load/abs/store sequence is expected.
define void @abs_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: abs z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <8 x i32>, ptr %a
  %mag = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %src, i1 false)
  ; The result is written back over the input.
  store <8 x i32> %mag, ptr %a
  ret void
}
; 512-bit case: in-place absolute value of <16 x i32>. There is no
; vscale_range attribute here, so the expected code depends on the RUN
; line's -aarch64-sve-vector-bits-min value: with 256-bit registers the
; vector is processed as two vl8 halves (the upper half addressed through
; x8), while 512 bits or more allow a single vl16 operation.
define void @abs_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: abs z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT: abs z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: abs_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: abs z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %src = load <16 x i32>, ptr %a
  %mag = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %src, i1 false)
  ; The result is written back over the input.
  store <16 x i32> %mag, ptr %a
  ret void
}
; 1024-bit case: in-place absolute value of <32 x i32>. vscale_range(8,0)
; gives a minimum SVE register size of 1024 bits, so one vl32
; word-predicated load/abs/store sequence is expected.
define void @abs_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: abs z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <32 x i32>, ptr %a
  %mag = call <32 x i32> @llvm.abs.v32i32(<32 x i32> %src, i1 false)
  ; The result is written back over the input.
  store <32 x i32> %mag, ptr %a
  ret void
}
; 2048-bit case: in-place absolute value of <64 x i32>. vscale_range(16,0)
; gives a minimum SVE register size of 2048 bits, so one vl64
; word-predicated load/abs/store sequence is expected.
define void @abs_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: abs z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <64 x i32>, ptr %a
  %mag = call <64 x i32> @llvm.abs.v64i32(<64 x i32> %src, i1 false)
  ; The result is written back over the input.
  store <64 x i32> %mag, ptr %a
  ret void
}
; 64-bit vectors must not use SVE: the single-element i64 vector is expected
; to become one `abs` on the d register.
define <1 x i64> @abs_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: abs d0, d0
; CHECK-NEXT: ret
  %mag = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
  ret <1 x i64> %mag
}
; 128-bit vectors must stay on NEON: the expected code is one `abs` on the
; 2d arrangement, with no SVE predicate or z-register involved.
define <2 x i64> @abs_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: abs v0.2d, v0.2d
; CHECK-NEXT: ret
  %mag = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
  ret <2 x i64> %mag
}
; 256-bit case: in-place absolute value of <4 x i64>. vscale_range(2,0)
; gives a minimum SVE register size of 256 bits, so one vl4
; doubleword-predicated load/abs/store sequence is expected.
define void @abs_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: abs z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <4 x i64>, ptr %a
  %mag = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %src, i1 false)
  ; The result is written back over the input.
  store <4 x i64> %mag, ptr %a
  ret void
}
; 512-bit case: in-place absolute value of <8 x i64>. There is no
; vscale_range attribute here, so the expected code depends on the RUN
; line's -aarch64-sve-vector-bits-min value: with 256-bit registers the
; vector is processed as two vl4 halves (the upper half addressed through
; x8), while 512 bits or more allow a single vl8 operation.
define void @abs_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: abs z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT: abs z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: abs_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: abs z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT: ret
  %src = load <8 x i64>, ptr %a
  %mag = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %src, i1 false)
  ; The result is written back over the input.
  store <8 x i64> %mag, ptr %a
  ret void
}
; 1024-bit case: in-place absolute value of <16 x i64>. vscale_range(8,0)
; gives a minimum SVE register size of 1024 bits, so one vl16
; doubleword-predicated load/abs/store sequence is expected.
define void @abs_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: abs z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <16 x i64>, ptr %a
  %mag = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %src, i1 false)
  ; The result is written back over the input.
  store <16 x i64> %mag, ptr %a
  ret void
}
; 2048-bit case: in-place absolute value of <32 x i64>. vscale_range(16,0)
; gives a minimum SVE register size of 2048 bits, so one vl32
; doubleword-predicated load/abs/store sequence is expected.
define void @abs_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: abs z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
  %src = load <32 x i64>, ptr %a
  %mag = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %src, i1 false)
  ; The result is written back over the input.
  store <32 x i64> %mag, ptr %a
  ret void
}
; Declarations of the llvm.abs overloads called by the abs_* tests in this
; file, grouped by element type (i8, i16, i32, i64) and ordered by vector
; length. The trailing i1 operand is the flag that every call site in this
; file passes as `i1 false`.
declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1)
declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)
declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1)
declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1)
declare <128 x i8> @llvm.abs.v128i8(<128 x i8>, i1)
declare <256 x i8> @llvm.abs.v256i8(<256 x i8>, i1)
declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1)
declare <64 x i16> @llvm.abs.v64i16(<64 x i16>, i1)
declare <128 x i16> @llvm.abs.v128i16(<128 x i16>, i1)
declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1)
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1)
declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare <32 x i32> @llvm.abs.v32i32(<32 x i32>, i1)
declare <64 x i32> @llvm.abs.v64i32(<64 x i32>, i1)
declare <1 x i64> @llvm.abs.v1i64(<1 x i64>, i1)
declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1)
; Attribute group referenced as #0 by the function definitions in this file;
; it enables the SVE target feature for them.
attributes #0 = { "target-features"="+sve" }