llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll
[AArch64][ISel] Select constructive EXT_ZZI pseudo instruction (#152554)
This patch adds patterns to select the EXT_ZZI_CONSTRUCTIVE pseudo
instruction instead of the destructive EXT_ZZI instruction for
vector_splice. This is only possible when the two inputs to
vector_splice are identical.

Because the destination register is no longer tied to a source
register, the register allocator has more freedom, and many MOVs are
replaced with MOVPRFXs.

In some cases, however, the same register could have been chosen for
both the input and the output, but the register allocator preferred not
to. As a result, some test cases now contain more instructions: a
MOVPRFX appears where previously no MOV was needed.

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 | FileCheck %s
define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) nounwind {
; CHECK-LABEL: llrint_v1i64_v1f16:
; CHECK: // %bb.0:
; CHECK-NEXT: frintx h0, h0
; CHECK-NEXT: fcvtzs x8, h0
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ret
%a = call <1 x i64> @llvm.llrint.v1i64.v1f16(<1 x half> %x)
ret <1 x i64> %a
}
declare <1 x i64> @llvm.llrint.v1i64.v1f16(<1 x half>)
define <2 x i64> @llrint_v2i64_v2f16(<2 x half> %x) nounwind {
; CHECK-LABEL: llrint_v2i64_v2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov h1, v0.h[1]
; CHECK-NEXT: frintx h0, h0
; CHECK-NEXT: frintx h1, h1
; CHECK-NEXT: fcvtzs x8, h0
; CHECK-NEXT: fcvtzs x9, h1
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: mov v0.d[1], x9
; CHECK-NEXT: ret
%a = call <2 x i64> @llvm.llrint.v2i64.v2f16(<2 x half> %x)
ret <2 x i64> %a
}
declare <2 x i64> @llvm.llrint.v2i64.v2f16(<2 x half>)
define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) nounwind {
; CHECK-LABEL: llrint_v4i64_v4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: frintx v0.4h, v0.4h
; CHECK-NEXT: mov h1, v0.h[2]
; CHECK-NEXT: mov h2, v0.h[3]
; CHECK-NEXT: mov h3, v0.h[1]
; CHECK-NEXT: fcvtzs x9, h0
; CHECK-NEXT: fcvtzs x8, h1
; CHECK-NEXT: fcvtzs x10, h2
; CHECK-NEXT: fcvtzs x11, h3
; CHECK-NEXT: fmov d0, x9
; CHECK-NEXT: fmov d1, x8
; CHECK-NEXT: mov v0.d[1], x11
; CHECK-NEXT: mov v1.d[1], x10
; CHECK-NEXT: ret
%a = call <4 x i64> @llvm.llrint.v4i64.v4f16(<4 x half> %x)
ret <4 x i64> %a
}
declare <4 x i64> @llvm.llrint.v4i64.v4f16(<4 x half>)
define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) nounwind {
; CHECK-LABEL: llrint_v8i64_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: frintx v0.4h, v0.4h
; CHECK-NEXT: frintx v1.4h, v1.4h
; CHECK-NEXT: mov h4, v0.h[2]
; CHECK-NEXT: mov h2, v0.h[1]
; CHECK-NEXT: mov h7, v0.h[3]
; CHECK-NEXT: fcvtzs x8, h0
; CHECK-NEXT: mov h3, v1.h[2]
; CHECK-NEXT: mov h5, v1.h[3]
; CHECK-NEXT: mov h6, v1.h[1]
; CHECK-NEXT: fcvtzs x11, h1
; CHECK-NEXT: fcvtzs x12, h4
; CHECK-NEXT: fcvtzs x9, h2
; CHECK-NEXT: fcvtzs x15, h7
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: fcvtzs x10, h3
; CHECK-NEXT: fcvtzs x13, h5
; CHECK-NEXT: fcvtzs x14, h6
; CHECK-NEXT: fmov d1, x12
; CHECK-NEXT: fmov d2, x11
; CHECK-NEXT: mov v0.d[1], x9
; CHECK-NEXT: fmov d3, x10
; CHECK-NEXT: mov v1.d[1], x15
; CHECK-NEXT: mov v2.d[1], x14
; CHECK-NEXT: mov v3.d[1], x13
; CHECK-NEXT: ret
%a = call <8 x i64> @llvm.llrint.v8i64.v8f16(<8 x half> %x)
ret <8 x i64> %a
}
declare <8 x i64> @llvm.llrint.v8i64.v8f16(<8 x half>)
define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) nounwind {
; CHECK-LABEL: llrint_v16i64_v16f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: frintx v1.4h, v1.4h
; CHECK-NEXT: frintx v3.4h, v0.4h
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: frintx v2.4h, v2.4h
; CHECK-NEXT: mov h4, v1.h[2]
; CHECK-NEXT: mov h5, v3.h[2]
; CHECK-NEXT: frintx v0.4h, v0.4h
; CHECK-NEXT: mov h6, v3.h[1]
; CHECK-NEXT: fcvtzs x9, h3
; CHECK-NEXT: mov h16, v1.h[1]
; CHECK-NEXT: fcvtzs x12, h1
; CHECK-NEXT: mov h3, v3.h[3]
; CHECK-NEXT: mov h17, v1.h[3]
; CHECK-NEXT: mov h7, v2.h[3]
; CHECK-NEXT: fcvtzs x8, h4
; CHECK-NEXT: fcvtzs x10, h5
; CHECK-NEXT: mov h4, v2.h[2]
; CHECK-NEXT: mov h5, v0.h[2]
; CHECK-NEXT: fcvtzs x11, h6
; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: fcvtzs x15, h2
; CHECK-NEXT: mov h2, v2.h[1]
; CHECK-NEXT: fcvtzs x14, h0
; CHECK-NEXT: fcvtzs x17, h3
; CHECK-NEXT: fcvtzs x0, h17
; CHECK-NEXT: fcvtzs x13, h7
; CHECK-NEXT: mov h7, v0.h[1]
; CHECK-NEXT: fmov d0, x9
; CHECK-NEXT: fcvtzs x16, h4
; CHECK-NEXT: fcvtzs x9, h5
; CHECK-NEXT: fmov d4, x12
; CHECK-NEXT: fcvtzs x12, h16
; CHECK-NEXT: fmov d1, x10
; CHECK-NEXT: fcvtzs x10, h6
; CHECK-NEXT: fmov d5, x8
; CHECK-NEXT: fcvtzs x8, h2
; CHECK-NEXT: fmov d2, x14
; CHECK-NEXT: fcvtzs x18, h7
; CHECK-NEXT: fmov d6, x15
; CHECK-NEXT: mov v0.d[1], x11
; CHECK-NEXT: fmov d3, x9
; CHECK-NEXT: fmov d7, x16
; CHECK-NEXT: mov v1.d[1], x17
; CHECK-NEXT: mov v4.d[1], x12
; CHECK-NEXT: mov v5.d[1], x0
; CHECK-NEXT: mov v6.d[1], x8
; CHECK-NEXT: mov v2.d[1], x18
; CHECK-NEXT: mov v3.d[1], x10
; CHECK-NEXT: mov v7.d[1], x13
; CHECK-NEXT: ret
%a = call <16 x i64> @llvm.llrint.v16i64.v16f16(<16 x half> %x)
ret <16 x i64> %a
}
declare <16 x i64> @llvm.llrint.v16i64.v16f16(<16 x half>)
define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) nounwind {
; CHECK-LABEL: llrint_v32i64_v32f16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: sub x9, sp, #272
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK-NEXT: frintx v5.4h, v0.4h
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: ext v17.16b, v2.16b, v2.16b, #8
; CHECK-NEXT: frintx v1.4h, v1.4h
; CHECK-NEXT: frintx v2.4h, v2.4h
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: mov h6, v5.h[3]
; CHECK-NEXT: frintx v0.4h, v0.4h
; CHECK-NEXT: mov h7, v5.h[2]
; CHECK-NEXT: mov h16, v5.h[1]
; CHECK-NEXT: frintx v4.4h, v4.4h
; CHECK-NEXT: fcvtzs x12, h5
; CHECK-NEXT: ext v5.16b, v3.16b, v3.16b, #8
; CHECK-NEXT: frintx v17.4h, v17.4h
; CHECK-NEXT: frintx v3.4h, v3.4h
; CHECK-NEXT: fcvtzs x9, h6
; CHECK-NEXT: mov h6, v0.h[3]
; CHECK-NEXT: fcvtzs x10, h7
; CHECK-NEXT: mov h7, v0.h[2]
; CHECK-NEXT: fcvtzs x11, h16
; CHECK-NEXT: mov h16, v0.h[1]
; CHECK-NEXT: fcvtzs x13, h6
; CHECK-NEXT: mov h6, v4.h[3]
; CHECK-NEXT: stp x10, x9, [sp, #48]
; CHECK-NEXT: fcvtzs x9, h7
; CHECK-NEXT: mov h7, v4.h[2]
; CHECK-NEXT: fcvtzs x10, h16
; CHECK-NEXT: mov h16, v4.h[1]
; CHECK-NEXT: stp x12, x11, [sp, #32]
; CHECK-NEXT: fcvtzs x11, h0
; CHECK-NEXT: frintx v0.4h, v5.4h
; CHECK-NEXT: mov h5, v17.h[3]
; CHECK-NEXT: fcvtzs x12, h6
; CHECK-NEXT: mov h6, v17.h[2]
; CHECK-NEXT: stp x9, x13, [sp, #16]
; CHECK-NEXT: fcvtzs x13, h7
; CHECK-NEXT: mov h7, v17.h[1]
; CHECK-NEXT: fcvtzs x9, h16
; CHECK-NEXT: stp x11, x10, [sp]
; CHECK-NEXT: fcvtzs x10, h4
; CHECK-NEXT: fcvtzs x11, h5
; CHECK-NEXT: mov h4, v0.h[3]
; CHECK-NEXT: mov h5, v0.h[2]
; CHECK-NEXT: stp x13, x12, [sp, #80]
; CHECK-NEXT: fcvtzs x12, h6
; CHECK-NEXT: fcvtzs x13, h7
; CHECK-NEXT: mov h6, v0.h[1]
; CHECK-NEXT: stp x10, x9, [sp, #64]
; CHECK-NEXT: fcvtzs x9, h17
; CHECK-NEXT: mov h7, v1.h[3]
; CHECK-NEXT: fcvtzs x10, h4
; CHECK-NEXT: mov h4, v1.h[2]
; CHECK-NEXT: stp x12, x11, [sp, #144]
; CHECK-NEXT: fcvtzs x11, h5
; CHECK-NEXT: mov h5, v1.h[1]
; CHECK-NEXT: fcvtzs x12, h6
; CHECK-NEXT: stp x9, x13, [sp, #128]
; CHECK-NEXT: fcvtzs x9, h0
; CHECK-NEXT: fcvtzs x13, h7
; CHECK-NEXT: mov h0, v2.h[3]
; CHECK-NEXT: stp x11, x10, [sp, #208]
; CHECK-NEXT: fcvtzs x10, h4
; CHECK-NEXT: mov h4, v2.h[2]
; CHECK-NEXT: fcvtzs x11, h5
; CHECK-NEXT: mov h5, v2.h[1]
; CHECK-NEXT: stp x9, x12, [sp, #192]
; CHECK-NEXT: fcvtzs x9, h1
; CHECK-NEXT: fcvtzs x12, h0
; CHECK-NEXT: mov h0, v3.h[3]
; CHECK-NEXT: mov h1, v3.h[2]
; CHECK-NEXT: stp x10, x13, [sp, #112]
; CHECK-NEXT: fcvtzs x10, h4
; CHECK-NEXT: mov h4, v3.h[1]
; CHECK-NEXT: fcvtzs x13, h5
; CHECK-NEXT: stp x9, x11, [sp, #96]
; CHECK-NEXT: fcvtzs x9, h2
; CHECK-NEXT: fcvtzs x11, h0
; CHECK-NEXT: stp x10, x12, [sp, #176]
; CHECK-NEXT: fcvtzs x10, h1
; CHECK-NEXT: fcvtzs x12, h4
; CHECK-NEXT: stp x9, x13, [sp, #160]
; CHECK-NEXT: fcvtzs x9, h3
; CHECK-NEXT: stp x10, x11, [sp, #240]
; CHECK-NEXT: add x10, sp, #64
; CHECK-NEXT: stp x9, x12, [sp, #224]
; CHECK-NEXT: add x9, sp, #32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x9]
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x10]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9]
; CHECK-NEXT: add x9, sp, #224
; CHECK-NEXT: add x10, sp, #128
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x9]
; CHECK-NEXT: add x9, sp, #160
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x10]
; CHECK-NEXT: add x10, sp, #96
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x9]
; CHECK-NEXT: add x9, sp, #192
; CHECK-NEXT: ld1d { z6.d }, p0/z, [x10]
; CHECK-NEXT: mov x10, #24 // =0x18
; CHECK-NEXT: ld1d { z7.d }, p0/z, [x9]
; CHECK-NEXT: mov x9, #16 // =0x10
; CHECK-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3]
; CHECK-NEXT: st1d { z5.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #8 // =0x8
; CHECK-NEXT: st1d { z6.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #28 // =0x1c
; CHECK-NEXT: st1d { z7.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #20 // =0x14
; CHECK-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #12 // =0xc
; CHECK-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #4 // =0x4
; CHECK-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%a = call <32 x i64> @llvm.llrint.v32i64.v32f16(<32 x half> %x)
ret <32 x i64> %a
}
declare <32 x i64> @llvm.llrint.v32i64.v32f16(<32 x half>)
define <1 x i64> @llrint_v1i64_v1f32(<1 x float> %x) nounwind {
; CHECK-LABEL: llrint_v1i64_v1f32:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: frintx s0, s0
; CHECK-NEXT: fcvtzs x8, s0
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ret
%a = call <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float> %x)
ret <1 x i64> %a
}
declare <1 x i64> @llvm.llrint.v1i64.v1f32(<1 x float>)
define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) nounwind {
; CHECK-LABEL: llrint_v2i64_v2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: frintx v0.2s, v0.2s
; CHECK-NEXT: fcvtl v0.2d, v0.2s
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-NEXT: ret
%a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x)
ret <2 x i64> %a
}
declare <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float>)
define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) nounwind {
; CHECK-LABEL: llrint_v4i64_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: frintx v0.4s, v0.4s
; CHECK-NEXT: mov s1, v0.s[2]
; CHECK-NEXT: mov s2, v0.s[3]
; CHECK-NEXT: mov s3, v0.s[1]
; CHECK-NEXT: fcvtzs x9, s0
; CHECK-NEXT: fcvtzs x8, s1
; CHECK-NEXT: fcvtzs x10, s2
; CHECK-NEXT: fcvtzs x11, s3
; CHECK-NEXT: fmov d0, x9
; CHECK-NEXT: fmov d1, x8
; CHECK-NEXT: mov v0.d[1], x11
; CHECK-NEXT: mov v1.d[1], x10
; CHECK-NEXT: ret
%a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
ret <4 x i64> %a
}
declare <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float>)
define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) nounwind {
; CHECK-LABEL: llrint_v8i64_v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: frintx v0.4s, v0.4s
; CHECK-NEXT: frintx v1.4s, v1.4s
; CHECK-NEXT: mov s3, v1.s[2]
; CHECK-NEXT: mov s4, v0.s[2]
; CHECK-NEXT: mov s2, v0.s[1]
; CHECK-NEXT: mov s5, v1.s[3]
; CHECK-NEXT: mov s6, v1.s[1]
; CHECK-NEXT: mov s7, v0.s[3]
; CHECK-NEXT: fcvtzs x8, s0
; CHECK-NEXT: fcvtzs x10, s1
; CHECK-NEXT: fcvtzs x11, s3
; CHECK-NEXT: fcvtzs x12, s4
; CHECK-NEXT: fcvtzs x9, s2
; CHECK-NEXT: fcvtzs x13, s5
; CHECK-NEXT: fcvtzs x14, s6
; CHECK-NEXT: fcvtzs x15, s7
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: fmov d2, x10
; CHECK-NEXT: fmov d1, x12
; CHECK-NEXT: fmov d3, x11
; CHECK-NEXT: mov v0.d[1], x9
; CHECK-NEXT: mov v2.d[1], x14
; CHECK-NEXT: mov v1.d[1], x15
; CHECK-NEXT: mov v3.d[1], x13
; CHECK-NEXT: ret
%a = call <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float> %x)
ret <8 x i64> %a
}
declare <8 x i64> @llvm.llrint.v8i64.v8f32(<8 x float>)
define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) nounwind {
; CHECK-LABEL: llrint_v16i64_v16f32:
; CHECK: // %bb.0:
; CHECK-NEXT: frintx v3.4s, v3.4s
; CHECK-NEXT: frintx v2.4s, v2.4s
; CHECK-NEXT: frintx v1.4s, v1.4s
; CHECK-NEXT: frintx v0.4s, v0.4s
; CHECK-NEXT: mov s4, v3.s[2]
; CHECK-NEXT: mov s5, v2.s[2]
; CHECK-NEXT: mov s6, v1.s[2]
; CHECK-NEXT: mov s7, v0.s[2]
; CHECK-NEXT: fcvtzs x10, s1
; CHECK-NEXT: fcvtzs x11, s0
; CHECK-NEXT: mov s16, v0.s[1]
; CHECK-NEXT: mov s17, v1.s[1]
; CHECK-NEXT: mov s18, v3.s[1]
; CHECK-NEXT: fcvtzs x14, s3
; CHECK-NEXT: fcvtzs x16, s2
; CHECK-NEXT: fcvtzs x8, s4
; CHECK-NEXT: mov s4, v2.s[1]
; CHECK-NEXT: fcvtzs x9, s5
; CHECK-NEXT: mov s5, v1.s[3]
; CHECK-NEXT: fcvtzs x12, s6
; CHECK-NEXT: mov s6, v0.s[3]
; CHECK-NEXT: fcvtzs x13, s7
; CHECK-NEXT: mov s7, v3.s[3]
; CHECK-NEXT: fmov d0, x11
; CHECK-NEXT: fcvtzs x17, s16
; CHECK-NEXT: fcvtzs x18, s18
; CHECK-NEXT: fcvtzs x15, s4
; CHECK-NEXT: mov s4, v2.s[3]
; CHECK-NEXT: fmov d2, x10
; CHECK-NEXT: fcvtzs x11, s5
; CHECK-NEXT: fcvtzs x10, s6
; CHECK-NEXT: fmov d3, x12
; CHECK-NEXT: fmov d1, x13
; CHECK-NEXT: fcvtzs x12, s17
; CHECK-NEXT: fcvtzs x13, s7
; CHECK-NEXT: fmov d5, x9
; CHECK-NEXT: fmov d6, x14
; CHECK-NEXT: fmov d7, x8
; CHECK-NEXT: fcvtzs x0, s4
; CHECK-NEXT: fmov d4, x16
; CHECK-NEXT: mov v0.d[1], x17
; CHECK-NEXT: mov v1.d[1], x10
; CHECK-NEXT: mov v3.d[1], x11
; CHECK-NEXT: mov v2.d[1], x12
; CHECK-NEXT: mov v6.d[1], x18
; CHECK-NEXT: mov v7.d[1], x13
; CHECK-NEXT: mov v4.d[1], x15
; CHECK-NEXT: mov v5.d[1], x0
; CHECK-NEXT: ret
%a = call <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float> %x)
ret <16 x i64> %a
}
declare <16 x i64> @llvm.llrint.v16i64.v16f32(<16 x float>)
define <32 x i64> @llrint_v32i64_v32f32(<32 x float> %x) nounwind {
; CHECK-LABEL: llrint_v32i64_v32f32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: sub x9, sp, #272
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK-NEXT: frintx v0.4s, v0.4s
; CHECK-NEXT: frintx v1.4s, v1.4s
; CHECK-NEXT: frintx v2.4s, v2.4s
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: mov s16, v0.s[3]
; CHECK-NEXT: mov s17, v0.s[2]
; CHECK-NEXT: mov s18, v0.s[1]
; CHECK-NEXT: fcvtzs x12, s0
; CHECK-NEXT: frintx v0.4s, v3.4s
; CHECK-NEXT: mov s3, v2.s[3]
; CHECK-NEXT: fcvtzs x9, s16
; CHECK-NEXT: mov s16, v1.s[3]
; CHECK-NEXT: fcvtzs x10, s17
; CHECK-NEXT: mov s17, v1.s[2]
; CHECK-NEXT: fcvtzs x11, s18
; CHECK-NEXT: mov s18, v1.s[1]
; CHECK-NEXT: fcvtzs x13, s16
; CHECK-NEXT: stp x10, x9, [sp, #16]
; CHECK-NEXT: mov s16, v2.s[2]
; CHECK-NEXT: fcvtzs x9, s17
; CHECK-NEXT: fcvtzs x10, s18
; CHECK-NEXT: mov s17, v2.s[1]
; CHECK-NEXT: stp x12, x11, [sp]
; CHECK-NEXT: fcvtzs x11, s1
; CHECK-NEXT: frintx v1.4s, v4.4s
; CHECK-NEXT: fcvtzs x12, s3
; CHECK-NEXT: mov s3, v0.s[3]
; CHECK-NEXT: mov s4, v0.s[2]
; CHECK-NEXT: stp x9, x13, [sp, #48]
; CHECK-NEXT: fcvtzs x13, s16
; CHECK-NEXT: fcvtzs x9, s17
; CHECK-NEXT: mov s16, v0.s[1]
; CHECK-NEXT: stp x11, x10, [sp, #32]
; CHECK-NEXT: fcvtzs x10, s2
; CHECK-NEXT: frintx v2.4s, v5.4s
; CHECK-NEXT: fcvtzs x11, s3
; CHECK-NEXT: mov s3, v1.s[3]
; CHECK-NEXT: mov s5, v1.s[1]
; CHECK-NEXT: stp x13, x12, [sp, #80]
; CHECK-NEXT: fcvtzs x12, s4
; CHECK-NEXT: mov s4, v1.s[2]
; CHECK-NEXT: fcvtzs x13, s16
; CHECK-NEXT: stp x10, x9, [sp, #64]
; CHECK-NEXT: fcvtzs x9, s0
; CHECK-NEXT: mov s0, v2.s[3]
; CHECK-NEXT: fcvtzs x10, s3
; CHECK-NEXT: frintx v3.4s, v6.4s
; CHECK-NEXT: stp x12, x11, [sp, #112]
; CHECK-NEXT: fcvtzs x11, s4
; CHECK-NEXT: mov s4, v2.s[2]
; CHECK-NEXT: fcvtzs x12, s5
; CHECK-NEXT: mov s5, v2.s[1]
; CHECK-NEXT: stp x9, x13, [sp, #96]
; CHECK-NEXT: fcvtzs x9, s1
; CHECK-NEXT: fcvtzs x13, s0
; CHECK-NEXT: mov s0, v3.s[3]
; CHECK-NEXT: frintx v1.4s, v7.4s
; CHECK-NEXT: stp x11, x10, [sp, #144]
; CHECK-NEXT: fcvtzs x10, s4
; CHECK-NEXT: mov s4, v3.s[2]
; CHECK-NEXT: fcvtzs x11, s5
; CHECK-NEXT: mov s5, v3.s[1]
; CHECK-NEXT: stp x9, x12, [sp, #128]
; CHECK-NEXT: fcvtzs x9, s2
; CHECK-NEXT: fcvtzs x12, s0
; CHECK-NEXT: mov s0, v1.s[3]
; CHECK-NEXT: mov s2, v1.s[2]
; CHECK-NEXT: stp x10, x13, [sp, #176]
; CHECK-NEXT: fcvtzs x10, s4
; CHECK-NEXT: mov s4, v1.s[1]
; CHECK-NEXT: fcvtzs x13, s5
; CHECK-NEXT: stp x9, x11, [sp, #160]
; CHECK-NEXT: fcvtzs x9, s3
; CHECK-NEXT: fcvtzs x11, s0
; CHECK-NEXT: stp x10, x12, [sp, #208]
; CHECK-NEXT: fcvtzs x10, s2
; CHECK-NEXT: fcvtzs x12, s4
; CHECK-NEXT: stp x9, x13, [sp, #192]
; CHECK-NEXT: fcvtzs x9, s1
; CHECK-NEXT: stp x10, x11, [sp, #240]
; CHECK-NEXT: add x10, sp, #64
; CHECK-NEXT: stp x9, x12, [sp, #224]
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x9]
; CHECK-NEXT: add x9, sp, #32
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x10]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9]
; CHECK-NEXT: add x9, sp, #224
; CHECK-NEXT: add x10, sp, #96
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x9]
; CHECK-NEXT: add x9, sp, #192
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x10]
; CHECK-NEXT: add x10, sp, #160
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x9]
; CHECK-NEXT: add x9, sp, #128
; CHECK-NEXT: ld1d { z6.d }, p0/z, [x10]
; CHECK-NEXT: mov x10, #28 // =0x1c
; CHECK-NEXT: ld1d { z7.d }, p0/z, [x9]
; CHECK-NEXT: mov x9, #24 // =0x18
; CHECK-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3]
; CHECK-NEXT: st1d { z5.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #20 // =0x14
; CHECK-NEXT: st1d { z6.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #16 // =0x10
; CHECK-NEXT: st1d { z7.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #12 // =0xc
; CHECK-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #8 // =0x8
; CHECK-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #4 // =0x4
; CHECK-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%a = call <32 x i64> @llvm.llrint.v32i64.v32f32(<32 x float> %x)
ret <32 x i64> %a
}
declare <32 x i64> @llvm.llrint.v32i64.v32f32(<32 x float>)
define <1 x i64> @llrint_v1i64_v1f64(<1 x double> %x) nounwind {
; CHECK-LABEL: llrint_v1i64_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: frintx d0, d0
; CHECK-NEXT: fcvtzs x8, d0
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: ret
%a = call <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double> %x)
ret <1 x i64> %a
}
declare <1 x i64> @llvm.llrint.v1i64.v1f64(<1 x double>)
define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) nounwind {
; CHECK-LABEL: llrint_v2i64_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: frintx v0.2d, v0.2d
; CHECK-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-NEXT: ret
%a = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> %x)
ret <2 x i64> %a
}
declare <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double>)
define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) nounwind {
; CHECK-LABEL: llrint_v4i64_v4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: frintx z0.d, p0/m, z0.d
; CHECK-NEXT: mov z1.d, z0.d[2]
; CHECK-NEXT: mov z2.d, z0.d[3]
; CHECK-NEXT: mov z3.d, z0.d[1]
; CHECK-NEXT: fcvtzs x9, d0
; CHECK-NEXT: fcvtzs x8, d1
; CHECK-NEXT: fcvtzs x10, d2
; CHECK-NEXT: fcvtzs x11, d3
; CHECK-NEXT: fmov d0, x9
; CHECK-NEXT: fmov d1, x8
; CHECK-NEXT: mov v0.d[1], x11
; CHECK-NEXT: mov v1.d[1], x10
; CHECK-NEXT: ret
%a = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> %x)
ret <4 x i64> %a
}
declare <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double>)
define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) nounwind {
; CHECK-LABEL: llrint_v8i64_v8f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: splice z2.d, p0, z2.d, z3.d
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: frintx z0.d, p0/m, z0.d
; CHECK-NEXT: movprfx z1, z2
; CHECK-NEXT: frintx z1.d, p0/m, z2.d
; CHECK-NEXT: mov z4.d, z1.d[2]
; CHECK-NEXT: mov z5.d, z0.d[2]
; CHECK-NEXT: mov z2.d, z0.d[1]
; CHECK-NEXT: mov z3.d, z1.d[3]
; CHECK-NEXT: mov z6.d, z0.d[3]
; CHECK-NEXT: fcvtzs x8, d0
; CHECK-NEXT: mov z0.d, z1.d[1]
; CHECK-NEXT: fcvtzs x10, d1
; CHECK-NEXT: fcvtzs x11, d4
; CHECK-NEXT: fcvtzs x12, d5
; CHECK-NEXT: fcvtzs x9, d2
; CHECK-NEXT: fcvtzs x13, d3
; CHECK-NEXT: fcvtzs x14, d6
; CHECK-NEXT: fcvtzs x15, d0
; CHECK-NEXT: fmov d0, x8
; CHECK-NEXT: fmov d2, x10
; CHECK-NEXT: fmov d1, x12
; CHECK-NEXT: fmov d3, x11
; CHECK-NEXT: mov v0.d[1], x9
; CHECK-NEXT: mov v2.d[1], x15
; CHECK-NEXT: mov v1.d[1], x14
; CHECK-NEXT: mov v3.d[1], x13
; CHECK-NEXT: ret
%a = call <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double> %x)
ret <8 x i64> %a
}
declare <8 x i64> @llvm.llrint.v8i64.v8f64(<8 x double>)
define <16 x i64> @llrint_v16i64_v16f64(<16 x double> %x) nounwind {
; CHECK-LABEL: llrint_v16i64_v16f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d, vl2
; CHECK-NEXT: // kill: def $q6 killed $q6 def $z6
; CHECK-NEXT: // kill: def $q4 killed $q4 def $z4
; CHECK-NEXT: // kill: def $q7 killed $q7 def $z7
; CHECK-NEXT: // kill: def $q5 killed $q5 def $z5
; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: splice z6.d, p1, z6.d, z7.d
; CHECK-NEXT: splice z4.d, p1, z4.d, z5.d
; CHECK-NEXT: splice z2.d, p1, z2.d, z3.d
; CHECK-NEXT: splice z0.d, p1, z0.d, z1.d
; CHECK-NEXT: movprfx z3, z6
; CHECK-NEXT: frintx z3.d, p0/m, z6.d
; CHECK-NEXT: movprfx z1, z4
; CHECK-NEXT: frintx z1.d, p0/m, z4.d
; CHECK-NEXT: frintx z2.d, p0/m, z2.d
; CHECK-NEXT: frintx z0.d, p0/m, z0.d
; CHECK-NEXT: mov z4.d, z3.d[2]
; CHECK-NEXT: mov z5.d, z1.d[2]
; CHECK-NEXT: mov z6.d, z2.d[3]
; CHECK-NEXT: fcvtzs x11, d0
; CHECK-NEXT: fcvtzs x12, d1
; CHECK-NEXT: fcvtzs x13, d2
; CHECK-NEXT: fcvtzs x14, d3
; CHECK-NEXT: mov z7.d, z3.d[3]
; CHECK-NEXT: mov z16.d, z1.d[3]
; CHECK-NEXT: fcvtzs x9, d4
; CHECK-NEXT: fcvtzs x10, d5
; CHECK-NEXT: mov z4.d, z2.d[2]
; CHECK-NEXT: mov z5.d, z0.d[2]
; CHECK-NEXT: fcvtzs x8, d6
; CHECK-NEXT: mov z2.d, z2.d[1]
; CHECK-NEXT: mov z6.d, z0.d[3]
; CHECK-NEXT: mov z1.d, z1.d[1]
; CHECK-NEXT: mov z3.d, z3.d[1]
; CHECK-NEXT: fcvtzs x15, d4
; CHECK-NEXT: mov z4.d, z0.d[1]
; CHECK-NEXT: fmov d0, x11
; CHECK-NEXT: fcvtzs x16, d5
; CHECK-NEXT: fcvtzs x11, d2
; CHECK-NEXT: fmov d2, x13
; CHECK-NEXT: fcvtzs x17, d7
; CHECK-NEXT: fcvtzs x18, d16
; CHECK-NEXT: fcvtzs x0, d3
; CHECK-NEXT: fcvtzs x13, d4
; CHECK-NEXT: fmov d4, x12
; CHECK-NEXT: fcvtzs x12, d6
; CHECK-NEXT: fmov d6, x14
; CHECK-NEXT: fcvtzs x14, d1
; CHECK-NEXT: fmov d3, x15
; CHECK-NEXT: fmov d1, x16
; CHECK-NEXT: fmov d5, x10
; CHECK-NEXT: fmov d7, x9
; CHECK-NEXT: mov v2.d[1], x11
; CHECK-NEXT: mov v0.d[1], x13
; CHECK-NEXT: mov v3.d[1], x8
; CHECK-NEXT: mov v6.d[1], x0
; CHECK-NEXT: mov v4.d[1], x14
; CHECK-NEXT: mov v1.d[1], x12
; CHECK-NEXT: mov v5.d[1], x18
; CHECK-NEXT: mov v7.d[1], x17
; CHECK-NEXT: ret
%a = call <16 x i64> @llvm.llrint.v16i64.v16f64(<16 x double> %x)
ret <16 x i64> %a
}
declare <16 x i64> @llvm.llrint.v16i64.v16f64(<16 x double>)
define <32 x i64> @llrint_v32i64_v32f64(<32 x double> %x) nounwind {
; CHECK-LABEL: llrint_v32i64_v32f64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: sub x9, sp, #272
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK-NEXT: ptrue p1.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q3 killed $q3 def $z3
; CHECK-NEXT: // kill: def $q2 killed $q2 def $z2
; CHECK-NEXT: // kill: def $q7 killed $q7 def $z7
; CHECK-NEXT: // kill: def $q6 killed $q6 def $z6
; CHECK-NEXT: // kill: def $q4 killed $q4 def $z4
; CHECK-NEXT: // kill: def $q5 killed $q5 def $z5
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: splice z0.d, p1, z0.d, z1.d
; CHECK-NEXT: splice z2.d, p1, z2.d, z3.d
; CHECK-NEXT: splice z4.d, p1, z4.d, z5.d
; CHECK-NEXT: splice z6.d, p1, z6.d, z7.d
; CHECK-NEXT: ldp q5, q19, [x29, #16]
; CHECK-NEXT: movprfx z3, z0
; CHECK-NEXT: frintx z3.d, p0/m, z0.d
; CHECK-NEXT: movprfx z16, z2
; CHECK-NEXT: frintx z16.d, p0/m, z2.d
; CHECK-NEXT: frintx z4.d, p0/m, z4.d
; CHECK-NEXT: splice z5.d, p1, z5.d, z19.d
; CHECK-NEXT: frintx z6.d, p0/m, z6.d
; CHECK-NEXT: ldp q2, q17, [x29, #48]
; CHECK-NEXT: ldp q0, q1, [x29, #112]
; CHECK-NEXT: mov z18.d, z3.d[3]
; CHECK-NEXT: mov z7.d, z3.d[2]
; CHECK-NEXT: fcvtzs x9, d3
; CHECK-NEXT: mov z3.d, z3.d[1]
; CHECK-NEXT: mov z20.d, z16.d[3]
; CHECK-NEXT: fcvtzs x12, d16
; CHECK-NEXT: splice z2.d, p1, z2.d, z17.d
; CHECK-NEXT: frintx z5.d, p0/m, z5.d
; CHECK-NEXT: splice z0.d, p1, z0.d, z1.d
; CHECK-NEXT: fcvtzs x10, d18
; CHECK-NEXT: fcvtzs x11, d7
; CHECK-NEXT: mov z18.d, z16.d[2]
; CHECK-NEXT: mov z7.d, z16.d[1]
; CHECK-NEXT: fcvtzs x13, d3
; CHECK-NEXT: str x9, [sp, #128]
; CHECK-NEXT: fcvtzs x9, d20
; CHECK-NEXT: mov z16.d, z4.d[3]
; CHECK-NEXT: ldp q3, q19, [x29, #80]
; CHECK-NEXT: frintx z2.d, p0/m, z2.d
; CHECK-NEXT: stp x11, x10, [sp, #144]
; CHECK-NEXT: fcvtzs x10, d18
; CHECK-NEXT: fcvtzs x11, d7
; CHECK-NEXT: mov z18.d, z4.d[2]
; CHECK-NEXT: mov z7.d, z4.d[1]
; CHECK-NEXT: str x13, [sp, #136]
; CHECK-NEXT: fcvtzs x13, d16
; CHECK-NEXT: mov z16.d, z6.d[3]
; CHECK-NEXT: splice z3.d, p1, z3.d, z19.d
; CHECK-NEXT: mov z1.d, z5.d[1]
; CHECK-NEXT: frintx z0.d, p0/m, z0.d
; CHECK-NEXT: stp x10, x9, [sp, #176]
; CHECK-NEXT: fcvtzs x9, d18
; CHECK-NEXT: fcvtzs x10, d4
; CHECK-NEXT: stp x12, x11, [sp, #160]
; CHECK-NEXT: fcvtzs x11, d7
; CHECK-NEXT: mov z4.d, z6.d[2]
; CHECK-NEXT: mov z7.d, z6.d[1]
; CHECK-NEXT: fcvtzs x12, d6
; CHECK-NEXT: mov z6.d, z5.d[2]
; CHECK-NEXT: frintx z3.d, p0/m, z3.d
; CHECK-NEXT: stp x9, x13, [sp, #208]
; CHECK-NEXT: fcvtzs x9, d16
; CHECK-NEXT: fcvtzs x13, d4
; CHECK-NEXT: stp x10, x11, [sp, #192]
; CHECK-NEXT: fcvtzs x10, d7
; CHECK-NEXT: mov z4.d, z5.d[3]
; CHECK-NEXT: fcvtzs x11, d4
; CHECK-NEXT: stp x13, x9, [sp, #240]
; CHECK-NEXT: fcvtzs x9, d6
; CHECK-NEXT: stp x12, x10, [sp, #224]
; CHECK-NEXT: fcvtzs x10, d5
; CHECK-NEXT: fcvtzs x12, d1
; CHECK-NEXT: mov z4.d, z2.d[3]
; CHECK-NEXT: mov z5.d, z2.d[2]
; CHECK-NEXT: mov z1.d, z2.d[1]
; CHECK-NEXT: fcvtzs x13, d2
; CHECK-NEXT: mov z2.d, z3.d[2]
; CHECK-NEXT: stp x9, x11, [sp, #16]
; CHECK-NEXT: fcvtzs x9, d4
; CHECK-NEXT: fcvtzs x11, d5
; CHECK-NEXT: stp x10, x12, [sp]
; CHECK-NEXT: fcvtzs x10, d1
; CHECK-NEXT: mov z4.d, z3.d[3]
; CHECK-NEXT: mov z1.d, z3.d[1]
; CHECK-NEXT: fcvtzs x12, d4
; CHECK-NEXT: stp x11, x9, [sp, #48]
; CHECK-NEXT: fcvtzs x9, d2
; CHECK-NEXT: fcvtzs x11, d3
; CHECK-NEXT: stp x13, x10, [sp, #32]
; CHECK-NEXT: fcvtzs x10, d1
; CHECK-NEXT: mov z2.d, z0.d[3]
; CHECK-NEXT: mov z3.d, z0.d[2]
; CHECK-NEXT: mov z1.d, z0.d[1]
; CHECK-NEXT: stp x9, x12, [sp, #80]
; CHECK-NEXT: fcvtzs x12, d0
; CHECK-NEXT: fcvtzs x13, d2
; CHECK-NEXT: fcvtzs x9, d3
; CHECK-NEXT: stp x11, x10, [sp, #64]
; CHECK-NEXT: fcvtzs x10, d1
; CHECK-NEXT: stp x9, x13, [sp, #112]
; CHECK-NEXT: add x9, sp, #128
; CHECK-NEXT: stp x12, x10, [sp, #96]
; CHECK-NEXT: add x10, sp, #192
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x9]
; CHECK-NEXT: add x9, sp, #160
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x10]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9]
; CHECK-NEXT: add x9, sp, #96
; CHECK-NEXT: add x10, sp, #224
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x9]
; CHECK-NEXT: add x9, sp, #64
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x10]
; CHECK-NEXT: add x10, sp, #32
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x9]
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: ld1d { z6.d }, p0/z, [x10]
; CHECK-NEXT: mov x10, #28 // =0x1c
; CHECK-NEXT: ld1d { z7.d }, p0/z, [x9]
; CHECK-NEXT: mov x9, #24 // =0x18
; CHECK-NEXT: st1d { z3.d }, p0, [x8, x10, lsl #3]
; CHECK-NEXT: st1d { z5.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #20 // =0x14
; CHECK-NEXT: st1d { z6.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #16 // =0x10
; CHECK-NEXT: st1d { z7.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #12 // =0xc
; CHECK-NEXT: st1d { z4.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #8 // =0x8
; CHECK-NEXT: st1d { z2.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: mov x9, #4 // =0x4
; CHECK-NEXT: st1d { z1.d }, p0, [x8, x9, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x8]
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%a = call <32 x i64> @llvm.llrint.v32i64.v32f64(<32 x double> %x)
ret <32 x i64> %a
}
declare <32 x i64> @llvm.llrint.v32i64.v32f64(<32 x double>)
define <1 x i64> @llrint_v1i64_v1fp128(<1 x fp128> %x) nounwind {
; CHECK-LABEL: llrint_v1i64_v1fp128:
; CHECK: // %bb.0:
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%a = call <1 x i64> @llvm.llrint.v1i64.v1fp128(<1 x fp128> %x)
ret <1 x i64> %a
}
declare <1 x i64> @llvm.llrint.v1i64.v1fp128(<1 x fp128>)
define <2 x i64> @llrint_v2i64_v2fp128(<2 x fp128> %x) nounwind {
; CHECK-LABEL: llrint_v2i64_v2fp128:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #48
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: add sp, sp, #48
; CHECK-NEXT: ret
%a = call <2 x i64> @llvm.llrint.v2i64.v2fp128(<2 x fp128> %x)
ret <2 x i64> %a
}
declare <2 x i64> @llvm.llrint.v2i64.v2fp128(<2 x fp128>)
define <4 x i64> @llrint_v4i64_v4fp128(<4 x fp128> %x) nounwind {
; CHECK-LABEL: llrint_v4i64_v4fp128:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #64
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: mov v0.16b, v3.16b
; CHECK-NEXT: stp q2, q1, [sp, #16] // 32-byte Folded Spill
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #64
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #64
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #64
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%a = call <4 x i64> @llvm.llrint.v4i64.v4fp128(<4 x fp128> %x)
ret <4 x i64> %a
}
declare <4 x i64> @llvm.llrint.v4i64.v4fp128(<4 x fp128>)
define <8 x i64> @llrint_v8i64_v8fp128(<8 x fp128> %x) nounwind {
; CHECK-LABEL: llrint_v8i64_v8fp128:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #128
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
; CHECK-NEXT: mov v0.16b, v7.16b
; CHECK-NEXT: stp q6, q5, [sp, #16] // 32-byte Folded Spill
; CHECK-NEXT: stp q4, q3, [sp, #48] // 32-byte Folded Spill
; CHECK-NEXT: stp q2, q1, [sp, #80] // 32-byte Folded Spill
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #128
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #128
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #128
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #128
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: ldr z2, [x8, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload
; CHECK-NEXT: movprfx z3, z2
; CHECK-NEXT: ext z3.b, z3.b, z2.b, #16
; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: // kill: def $q3 killed $q3 killed $z3
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: add sp, sp, #128
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%a = call <8 x i64> @llvm.llrint.v8i64.v8fp128(<8 x fp128> %x)
ret <8 x i64> %a
}
declare <8 x i64> @llvm.llrint.v8i64.v8fp128(<8 x fp128>)
define <16 x i64> @llrint_v16i64_v16fp128(<16 x fp128> %x) nounwind {
; CHECK-LABEL: llrint_v16i64_v16fp128:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #256
; CHECK-NEXT: addvl sp, sp, #-4
; CHECK-NEXT: addvl x8, sp, #4
; CHECK-NEXT: str q1, [sp, #240] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, #272]
; CHECK-NEXT: addvl x8, sp, #4
; CHECK-NEXT: str q0, [sp, #224] // 16-byte Folded Spill
; CHECK-NEXT: stp q7, q6, [sp, #128] // 32-byte Folded Spill
; CHECK-NEXT: str q1, [sp, #112] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, #288]
; CHECK-NEXT: addvl x8, sp, #4
; CHECK-NEXT: stp q5, q4, [sp, #160] // 32-byte Folded Spill
; CHECK-NEXT: str q1, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, #304]
; CHECK-NEXT: addvl x8, sp, #4
; CHECK-NEXT: stp q3, q2, [sp, #192] // 32-byte Folded Spill
; CHECK-NEXT: str q1, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, #320]
; CHECK-NEXT: addvl x8, sp, #4
; CHECK-NEXT: str q1, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, #336]
; CHECK-NEXT: addvl x8, sp, #4
; CHECK-NEXT: str q1, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, #352]
; CHECK-NEXT: addvl x8, sp, #4
; CHECK-NEXT: str q1, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, #368]
; CHECK-NEXT: addvl x8, sp, #4
; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: ldr q1, [x8, #384]
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #256
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #256
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ldr z1, [x8, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #256
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #256
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ldr z1, [x8, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #128] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #144] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #128] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #256
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #160] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #160] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #176] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #160] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #256
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #192] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #192] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #208] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #192] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #256
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #240] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: str q0, [sp, #240] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #224] // 16-byte Folded Reload
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldr q1, [sp, #240] // 16-byte Folded Reload
; CHECK-NEXT: add x8, sp, #256
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: ldr z2, [x8, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z4, [x8, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z6, [x8, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload
; CHECK-NEXT: movprfx z3, z2
; CHECK-NEXT: ext z3.b, z3.b, z2.b, #16
; CHECK-NEXT: movprfx z5, z4
; CHECK-NEXT: ext z5.b, z5.b, z4.b, #16
; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2
; CHECK-NEXT: // kill: def $q4 killed $q4 killed $z4
; CHECK-NEXT: movprfx z7, z6
; CHECK-NEXT: ext z7.b, z7.b, z6.b, #16
; CHECK-NEXT: // kill: def $q6 killed $q6 killed $z6
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: // kill: def $q3 killed $q3 killed $z3
; CHECK-NEXT: // kill: def $q5 killed $q5 killed $z5
; CHECK-NEXT: // kill: def $q7 killed $q7 killed $z7
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: add sp, sp, #256
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%a = call <16 x i64> @llvm.llrint.v16i64.v16fp128(<16 x fp128> %x)
ret <16 x i64> %a
}
declare <16 x i64> @llvm.llrint.v16i64.v16fp128(<16 x fp128>)