llvm-project/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
Eli Friedman c83f23d6ab
[AArch64] Fix heuristics for folding "lsl" into load/store ops. (#86894)
The existing heuristics were assuming that every core behaves like an
Apple A7, where any extend/shift costs an extra micro-op... but in
reality, nothing else behaves like that.

On some older Cortex designs, shifts by 1 or 4 cost extra, but all other
shifts/extensions are free. On all other cores, as far as I can tell,
all shifts/extensions for integer loads are free (i.e. the same cost as
an unshifted load).

To reflect this, this patch:

- Enables aggressive folding of shifts into loads by default.

- Removes the old AddrLSLFast feature, since it applies to everything
except A7 (and even if you are explicitly targeting A7, we want to
assume extensions are free because the code will almost always run on a
newer core).

- Adds a new feature AddrLSLSlow14 that applies specifically to the
Cortex cores where shifts by 1 or 4 cost extra.

I didn't add support for AddrLSLSlow14 on the GlobalISel side because it
would require a bunch of refactoring to work correctly. Someone can pick
this up as a followup.
2024-04-04 11:25:44 -07:00

750 lines
24 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
; rdar://9428579
; Named struct types, each wrapping a single vector member.
; NOTE(review): none of these is referenced by the functions visible in this
; view of the file - confirm before removing.
%type1 = type { <16 x i8> }
%type2 = type { <8 x i8> }
%type3 = type { <4 x i16> }
; Stores an all-zero <16 x i8> through the pointer loaded from %argtable.
; The expected output materializes the zero vector with movi.2d and writes it
; with a single 128-bit "str q0".
define hidden fastcc void @t1(ptr %argtable) nounwind {
; CHECK-LABEL: t1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: ldr x8, [x0]
; CHECK-NEXT: str q0, [x8]
; CHECK-NEXT: ret
entry:
%tmp1 = load ptr, ptr %argtable, align 8
store <16 x i8> zeroinitializer, ptr %tmp1, align 16
ret void
}
; 64-bit counterpart of @t1: stores an all-zero <8 x i8> through the pointer
; loaded from %argtable. The expected output still zeroes the register with
; movi.2d but writes only the low 64 bits with "str d0".
define hidden fastcc void @t2(ptr %argtable) nounwind {
; CHECK-LABEL: t2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: ldr x8, [x0]
; CHECK-NEXT: str d0, [x8]
; CHECK-NEXT: ret
entry:
%tmp1 = load ptr, ptr %argtable, align 8
store <8 x i8> zeroinitializer, ptr %tmp1, align 8
ret void
}
; Add a bunch of tests for rdar://11246289
; Pointer-typed globals, all initialized to null. The fct1_*/fct2_* tests
; below load the destination array pointer from the matching @globalArray*
; symbol (reached through the GOT in the expected output).
@globalArray64x2 = common global ptr null, align 8
@globalArray32x4 = common global ptr null, align 8
@globalArray16x8 = common global ptr null, align 8
@globalArray8x16 = common global ptr null, align 8
@globalArray64x1 = common global ptr null, align 8
@globalArray32x2 = common global ptr null, align 8
@globalArray16x4 = common global ptr null, align 8
@globalArray8x8 = common global ptr null, align 8
; NOTE(review): the floatglobalArray* symbols are not referenced by any
; function visible in this view of the file - confirm before removing.
@floatglobalArray64x2 = common global ptr null, align 8
@floatglobalArray32x4 = common global ptr null, align 8
@floatglobalArray64x1 = common global ptr null, align 8
@floatglobalArray32x2 = common global ptr null, align 8
; Copies one <2 x i64> element at a variable index: loads %array[%offset] and
; stores it to index %offset of the array whose pointer is read from
; @globalArray64x2. The expected output computes the byte offset once with
; "lsl x9, x1, #4" and reuses x9 as the register offset of both the q0 load
; and the q0 store.
define void @fct1_64x2(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_64x2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray64x2
; CHECK-NEXT: lsl x9, x1, #4
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray64x2]
; CHECK-NEXT: ldr q0, [x0, x9]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str q0, [x8, x9]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <2 x i64>, ptr %array, i64 %offset
%tmp = load <2 x i64>, ptr %arrayidx, align 16
%tmp1 = load ptr, ptr @globalArray64x2, align 8
%arrayidx1 = getelementptr inbounds <2 x i64>, ptr %tmp1, i64 %offset
store <2 x i64> %tmp, ptr %arrayidx1, align 16
ret void
}
; Constant-index variant for <2 x i64>: loads element 3 of %array and stores
; it to element 5 of the array pointed to by @globalArray64x2. The expected
; output uses immediate offsets #48 and #80 (index * 16 bytes) on ldr/str q0.
define void @fct2_64x2(ptr nocapture %array) nounwind ssp {
; CHECK-LABEL: fct2_64x2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray64x2
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray64x2]
; CHECK-NEXT: ldr q0, [x0, #48]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str q0, [x8, #80]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <2 x i64>, ptr %array, i64 3
%tmp = load <2 x i64>, ptr %arrayidx, align 16
%tmp1 = load ptr, ptr @globalArray64x2, align 8
%arrayidx1 = getelementptr inbounds <2 x i64>, ptr %tmp1, i64 5
store <2 x i64> %tmp, ptr %arrayidx1, align 16
ret void
}
; Copies one <4 x i32> element at a variable index from %array to the array
; whose pointer is read from @globalArray32x4. The expected output computes
; the byte offset once with "lsl x9, x1, #4" and reuses x9 as the register
; offset of both the q0 load and the q0 store.
define void @fct1_32x4(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_32x4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray32x4
; CHECK-NEXT: lsl x9, x1, #4
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray32x4]
; CHECK-NEXT: ldr q0, [x0, x9]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str q0, [x8, x9]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <4 x i32>, ptr %array, i64 %offset
%tmp = load <4 x i32>, ptr %arrayidx, align 16
%tmp1 = load ptr, ptr @globalArray32x4, align 8
%arrayidx1 = getelementptr inbounds <4 x i32>, ptr %tmp1, i64 %offset
store <4 x i32> %tmp, ptr %arrayidx1, align 16
ret void
}
; Constant-index variant for <4 x i32>: loads element 3 of %array and stores
; it to element 5 of the array pointed to by @globalArray32x4. The expected
; output uses immediate offsets #48 and #80 (index * 16 bytes) on ldr/str q0.
define void @fct2_32x4(ptr nocapture %array) nounwind ssp {
; CHECK-LABEL: fct2_32x4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray32x4
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray32x4]
; CHECK-NEXT: ldr q0, [x0, #48]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str q0, [x8, #80]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <4 x i32>, ptr %array, i64 3
%tmp = load <4 x i32>, ptr %arrayidx, align 16
%tmp1 = load ptr, ptr @globalArray32x4, align 8
%arrayidx1 = getelementptr inbounds <4 x i32>, ptr %tmp1, i64 5
store <4 x i32> %tmp, ptr %arrayidx1, align 16
ret void
}
; Copies one <8 x i16> element at a variable index from %array to the array
; whose pointer is read from @globalArray16x8. The expected output computes
; the byte offset once with "lsl x9, x1, #4" and reuses x9 as the register
; offset of both the q0 load and the q0 store.
define void @fct1_16x8(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_16x8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray16x8
; CHECK-NEXT: lsl x9, x1, #4
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray16x8]
; CHECK-NEXT: ldr q0, [x0, x9]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str q0, [x8, x9]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <8 x i16>, ptr %array, i64 %offset
%tmp = load <8 x i16>, ptr %arrayidx, align 16
%tmp1 = load ptr, ptr @globalArray16x8, align 8
%arrayidx1 = getelementptr inbounds <8 x i16>, ptr %tmp1, i64 %offset
store <8 x i16> %tmp, ptr %arrayidx1, align 16
ret void
}
; Constant-index variant for <8 x i16>: loads element 3 of %array and stores
; it to element 5 of the array pointed to by @globalArray16x8. The expected
; output uses immediate offsets #48 and #80 (index * 16 bytes) on ldr/str q0.
define void @fct2_16x8(ptr nocapture %array) nounwind ssp {
; CHECK-LABEL: fct2_16x8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray16x8
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray16x8]
; CHECK-NEXT: ldr q0, [x0, #48]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str q0, [x8, #80]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <8 x i16>, ptr %array, i64 3
%tmp = load <8 x i16>, ptr %arrayidx, align 16
%tmp1 = load ptr, ptr @globalArray16x8, align 8
%arrayidx1 = getelementptr inbounds <8 x i16>, ptr %tmp1, i64 5
store <8 x i16> %tmp, ptr %arrayidx1, align 16
ret void
}
; Copies one <16 x i8> element at a variable index from %array to the array
; whose pointer is read from @globalArray8x16. The expected output computes
; the byte offset once with "lsl x9, x1, #4" and reuses x9 as the register
; offset of both the q0 load and the q0 store.
define void @fct1_8x16(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_8x16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray8x16
; CHECK-NEXT: lsl x9, x1, #4
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray8x16]
; CHECK-NEXT: ldr q0, [x0, x9]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str q0, [x8, x9]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <16 x i8>, ptr %array, i64 %offset
%tmp = load <16 x i8>, ptr %arrayidx, align 16
%tmp1 = load ptr, ptr @globalArray8x16, align 8
%arrayidx1 = getelementptr inbounds <16 x i8>, ptr %tmp1, i64 %offset
store <16 x i8> %tmp, ptr %arrayidx1, align 16
ret void
}
; Constant-index variant for <16 x i8>: loads element 3 of %array and stores
; it to element 5 of the array pointed to by @globalArray8x16. The expected
; output uses immediate offsets #48 and #80 (index * 16 bytes) on ldr/str q0.
define void @fct2_8x16(ptr nocapture %array) nounwind ssp {
; CHECK-LABEL: fct2_8x16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray8x16
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray8x16]
; CHECK-NEXT: ldr q0, [x0, #48]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str q0, [x8, #80]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <16 x i8>, ptr %array, i64 3
%tmp = load <16 x i8>, ptr %arrayidx, align 16
%tmp1 = load ptr, ptr @globalArray8x16, align 8
%arrayidx1 = getelementptr inbounds <16 x i8>, ptr %tmp1, i64 5
store <16 x i8> %tmp, ptr %arrayidx1, align 16
ret void
}
; Copies one <1 x i64> element at a variable index from %array to the array
; whose pointer is read from @globalArray64x1. The expected output has no
; separate shift: the scaling is folded into the addressing mode
; ("x1, lsl #3") of both the d0 load and the d0 store.
define void @fct1_64x1(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_64x1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray64x1
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray64x1]
; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <1 x i64>, ptr %array, i64 %offset
%tmp = load <1 x i64>, ptr %arrayidx, align 8
%tmp1 = load ptr, ptr @globalArray64x1, align 8
%arrayidx1 = getelementptr inbounds <1 x i64>, ptr %tmp1, i64 %offset
store <1 x i64> %tmp, ptr %arrayidx1, align 8
ret void
}
; Constant-index variant for <1 x i64>: loads element 3 of %array and stores
; it to element 5 of the array pointed to by @globalArray64x1. The expected
; output uses immediate offsets #24 and #40 (index * 8 bytes) on ldr/str d0.
define void @fct2_64x1(ptr nocapture %array) nounwind ssp {
; CHECK-LABEL: fct2_64x1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray64x1
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray64x1]
; CHECK-NEXT: ldr d0, [x0, #24]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str d0, [x8, #40]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <1 x i64>, ptr %array, i64 3
%tmp = load <1 x i64>, ptr %arrayidx, align 8
%tmp1 = load ptr, ptr @globalArray64x1, align 8
%arrayidx1 = getelementptr inbounds <1 x i64>, ptr %tmp1, i64 5
store <1 x i64> %tmp, ptr %arrayidx1, align 8
ret void
}
; Copies one <2 x i32> element at a variable index from %array to the array
; whose pointer is read from @globalArray32x2. The expected output folds the
; scaling into the addressing mode ("x1, lsl #3") of both the d0 load and the
; d0 store.
define void @fct1_32x2(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_32x2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray32x2
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray32x2]
; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <2 x i32>, ptr %array, i64 %offset
%tmp = load <2 x i32>, ptr %arrayidx, align 8
%tmp1 = load ptr, ptr @globalArray32x2, align 8
%arrayidx1 = getelementptr inbounds <2 x i32>, ptr %tmp1, i64 %offset
store <2 x i32> %tmp, ptr %arrayidx1, align 8
ret void
}
; Constant-index variant for <2 x i32>: loads element 3 of %array and stores
; it to element 5 of the array pointed to by @globalArray32x2. The expected
; output uses immediate offsets #24 and #40 (index * 8 bytes) on ldr/str d0.
define void @fct2_32x2(ptr nocapture %array) nounwind ssp {
; CHECK-LABEL: fct2_32x2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray32x2
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray32x2]
; CHECK-NEXT: ldr d0, [x0, #24]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str d0, [x8, #40]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <2 x i32>, ptr %array, i64 3
%tmp = load <2 x i32>, ptr %arrayidx, align 8
%tmp1 = load ptr, ptr @globalArray32x2, align 8
%arrayidx1 = getelementptr inbounds <2 x i32>, ptr %tmp1, i64 5
store <2 x i32> %tmp, ptr %arrayidx1, align 8
ret void
}
; Copies one <4 x i16> element at a variable index from %array to the array
; whose pointer is read from @globalArray16x4. The expected output folds the
; scaling into the addressing mode ("x1, lsl #3") of both the d0 load and the
; d0 store.
define void @fct1_16x4(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_16x4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray16x4
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray16x4]
; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <4 x i16>, ptr %array, i64 %offset
%tmp = load <4 x i16>, ptr %arrayidx, align 8
%tmp1 = load ptr, ptr @globalArray16x4, align 8
%arrayidx1 = getelementptr inbounds <4 x i16>, ptr %tmp1, i64 %offset
store <4 x i16> %tmp, ptr %arrayidx1, align 8
ret void
}
; Constant-index variant for <4 x i16>: loads element 3 of %array and stores
; it to element 5 of the array pointed to by @globalArray16x4. The expected
; output uses immediate offsets #24 and #40 (index * 8 bytes) on ldr/str d0.
define void @fct2_16x4(ptr nocapture %array) nounwind ssp {
; CHECK-LABEL: fct2_16x4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray16x4
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray16x4]
; CHECK-NEXT: ldr d0, [x0, #24]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str d0, [x8, #40]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <4 x i16>, ptr %array, i64 3
%tmp = load <4 x i16>, ptr %arrayidx, align 8
%tmp1 = load ptr, ptr @globalArray16x4, align 8
%arrayidx1 = getelementptr inbounds <4 x i16>, ptr %tmp1, i64 5
store <4 x i16> %tmp, ptr %arrayidx1, align 8
ret void
}
; Copies one <8 x i8> element at a variable index from %array to the array
; whose pointer is read from @globalArray8x8. The expected output folds the
; scaling into the addressing mode ("x1, lsl #3") of both the d0 load and the
; d0 store.
define void @fct1_8x8(ptr nocapture %array, i64 %offset) nounwind ssp {
; CHECK-LABEL: fct1_8x8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: adrp x8, :got:globalArray8x8
; CHECK-NEXT: ldr x8, [x8, :got_lo12:globalArray8x8]
; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ldr x8, [x8]
; CHECK-NEXT: str d0, [x8, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds <8 x i8>, ptr %array, i64 %offset
%tmp = load <8 x i8>, ptr %arrayidx, align 8
%tmp1 = load ptr, ptr @globalArray8x8, align 8
%arrayidx1 = getelementptr inbounds <8 x i8>, ptr %tmp1, i64 %offset
store <8 x i8> %tmp, ptr %arrayidx1, align 8
ret void
}
; Add a bunch of tests for rdar://13258794: Match LDUR/STUR for D and Q
; registers for unscaled vector accesses
; Loads a <1 x i64> from byte offset 3 of %str. The offset is not a multiple
; of the 8-byte access size, and the expected output is the unscaled form
; "ldur d0, [x0, #3]".
define <1 x i64> @fct0(ptr %str) nounwind readonly ssp {
; CHECK-LABEL: fct0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur d0, [x0, #3]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <1 x i64>, ptr %p, align 8
ret <1 x i64> %0
}
; Loads a <2 x i32> from byte offset 3 of %str; the expected output is the
; unscaled 64-bit load "ldur d0, [x0, #3]".
define <2 x i32> @fct1(ptr %str) nounwind readonly ssp {
; CHECK-LABEL: fct1:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur d0, [x0, #3]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <2 x i32>, ptr %p, align 8
ret <2 x i32> %0
}
; Loads a <4 x i16> from byte offset 3 of %str; the expected output is the
; unscaled 64-bit load "ldur d0, [x0, #3]".
define <4 x i16> @fct2(ptr %str) nounwind readonly ssp {
; CHECK-LABEL: fct2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur d0, [x0, #3]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <4 x i16>, ptr %p, align 8
ret <4 x i16> %0
}
; Loads a <8 x i8> from byte offset 3 of %str; the expected output is the
; unscaled 64-bit load "ldur d0, [x0, #3]".
define <8 x i8> @fct3(ptr %str) nounwind readonly ssp {
; CHECK-LABEL: fct3:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur d0, [x0, #3]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <8 x i8>, ptr %p, align 8
ret <8 x i8> %0
}
; Loads a <2 x i64> from byte offset 3 of %str. The offset is not a multiple
; of the 16-byte access size, and the expected output is the unscaled form
; "ldur q0, [x0, #3]".
define <2 x i64> @fct4(ptr %str) nounwind readonly ssp {
; CHECK-LABEL: fct4:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur q0, [x0, #3]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <2 x i64>, ptr %p, align 16
ret <2 x i64> %0
}
; Loads a <4 x i32> from byte offset 3 of %str; the expected output is the
; unscaled 128-bit load "ldur q0, [x0, #3]".
define <4 x i32> @fct5(ptr %str) nounwind readonly ssp {
; CHECK-LABEL: fct5:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur q0, [x0, #3]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <4 x i32>, ptr %p, align 16
ret <4 x i32> %0
}
; Loads a <8 x i16> from byte offset 3 of %str; the expected output is the
; unscaled 128-bit load "ldur q0, [x0, #3]".
define <8 x i16> @fct6(ptr %str) nounwind readonly ssp {
; CHECK-LABEL: fct6:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur q0, [x0, #3]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <8 x i16>, ptr %p, align 16
ret <8 x i16> %0
}
; Loads a <16 x i8> from byte offset 3 of %str; the expected output is the
; unscaled 128-bit load "ldur q0, [x0, #3]".
define <16 x i8> @fct7(ptr %str) nounwind readonly ssp {
; CHECK-LABEL: fct7:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur q0, [x0, #3]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <16 x i8>, ptr %p, align 16
ret <16 x i8> %0
}
; Loads a <1 x i64> from byte offset 3 of %str and stores it back at byte
; offset 4. Neither offset is a multiple of the 8-byte access size; the
; expected output pairs the unscaled "ldur d0" with the unscaled "stur d0".
define void @fct8(ptr %str) nounwind ssp {
; CHECK-LABEL: fct8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur d0, [x0, #3]
; CHECK-NEXT: stur d0, [x0, #4]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <1 x i64>, ptr %p, align 8
%p2 = getelementptr inbounds i8, ptr %str, i64 4
store <1 x i64> %0, ptr %p2, align 8
ret void
}
; Loads a <2 x i32> from byte offset 3 of %str and stores it back at byte
; offset 4; the expected output is "ldur d0" followed by "stur d0".
define void @fct9(ptr %str) nounwind ssp {
; CHECK-LABEL: fct9:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur d0, [x0, #3]
; CHECK-NEXT: stur d0, [x0, #4]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <2 x i32>, ptr %p, align 8
%p2 = getelementptr inbounds i8, ptr %str, i64 4
store <2 x i32> %0, ptr %p2, align 8
ret void
}
; Loads a <4 x i16> from byte offset 3 of %str and stores it back at byte
; offset 4; the expected output is "ldur d0" followed by "stur d0".
define void @fct10(ptr %str) nounwind ssp {
; CHECK-LABEL: fct10:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur d0, [x0, #3]
; CHECK-NEXT: stur d0, [x0, #4]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <4 x i16>, ptr %p, align 8
%p2 = getelementptr inbounds i8, ptr %str, i64 4
store <4 x i16> %0, ptr %p2, align 8
ret void
}
; Loads a <8 x i8> from byte offset 3 of %str and stores it back at byte
; offset 4; the expected output is "ldur d0" followed by "stur d0".
define void @fct11(ptr %str) nounwind ssp {
; CHECK-LABEL: fct11:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur d0, [x0, #3]
; CHECK-NEXT: stur d0, [x0, #4]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <8 x i8>, ptr %p, align 8
%p2 = getelementptr inbounds i8, ptr %str, i64 4
store <8 x i8> %0, ptr %p2, align 8
ret void
}
; Loads a <2 x i64> from byte offset 3 of %str and stores it back at byte
; offset 4. Neither offset is a multiple of the 16-byte access size; the
; expected output pairs the unscaled "ldur q0" with the unscaled "stur q0".
define void @fct12(ptr %str) nounwind ssp {
; CHECK-LABEL: fct12:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur q0, [x0, #3]
; CHECK-NEXT: stur q0, [x0, #4]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <2 x i64>, ptr %p, align 16
%p2 = getelementptr inbounds i8, ptr %str, i64 4
store <2 x i64> %0, ptr %p2, align 16
ret void
}
; Loads a <4 x i32> from byte offset 3 of %str and stores it back at byte
; offset 4; the expected output is "ldur q0" followed by "stur q0".
define void @fct13(ptr %str) nounwind ssp {
; CHECK-LABEL: fct13:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur q0, [x0, #3]
; CHECK-NEXT: stur q0, [x0, #4]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <4 x i32>, ptr %p, align 16
%p2 = getelementptr inbounds i8, ptr %str, i64 4
store <4 x i32> %0, ptr %p2, align 16
ret void
}
; Loads a <8 x i16> from byte offset 3 of %str and stores it back at byte
; offset 4; the expected output is "ldur q0" followed by "stur q0".
define void @fct14(ptr %str) nounwind ssp {
; CHECK-LABEL: fct14:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur q0, [x0, #3]
; CHECK-NEXT: stur q0, [x0, #4]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <8 x i16>, ptr %p, align 16
%p2 = getelementptr inbounds i8, ptr %str, i64 4
store <8 x i16> %0, ptr %p2, align 16
ret void
}
; Loads a <16 x i8> from byte offset 3 of %str and stores it back at byte
; offset 4; the expected output is "ldur q0" followed by "stur q0".
define void @fct15(ptr %str) nounwind ssp {
; CHECK-LABEL: fct15:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldur q0, [x0, #3]
; CHECK-NEXT: stur q0, [x0, #4]
; CHECK-NEXT: ret
entry:
%p = getelementptr inbounds i8, ptr %str, i64 3
%0 = load <16 x i8>, ptr %p, align 16
%p2 = getelementptr inbounds i8, ptr %str, i64 4
store <16 x i8> %0, ptr %p2, align 16
ret void
}
; Check the building of vector from a single loaded value.
; Part of <rdar://problem/14170854>
;
; Single loads with immediate offset.
; Loads one i8 from %sp0 + 1, inserts it into lane 0 of an otherwise undef
; <8 x i8>, and multiplies the vector by itself. The expected output loads
; directly into b0 with immediate offset #1 and then issues mul.8b; there is
; no separate general-register load or lane insert.
define <8 x i8> @fct16(ptr nocapture %sp0) {
; CHECK-LABEL: fct16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: mul.8b v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i8, ptr %sp0, i64 1
%pix_sp0.0.copyload = load i8, ptr %addr, align 1
%vec = insertelement <8 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <8 x i8> %vec, %vec
ret <8 x i8> %vmull.i
}
; Loads one i8 from %sp0 + 1, inserts it into lane 0 of an otherwise undef
; <16 x i8>, and multiplies the vector by itself. The expected output is
; "ldr b0, [x0, #1]" followed by mul.16b.
define <16 x i8> @fct17(ptr nocapture %sp0) {
; CHECK-LABEL: fct17:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: mul.16b v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i8, ptr %sp0, i64 1
%pix_sp0.0.copyload = load i8, ptr %addr, align 1
%vec = insertelement <16 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <16 x i8> %vec, %vec
ret <16 x i8> %vmull.i
}
; Loads the i16 at element index 1 of %sp0, inserts it into lane 0 of an
; otherwise undef <4 x i16>, and multiplies the vector by itself. The expected
; output is "ldr h0, [x0, #2]" (index * 2 bytes) followed by mul.4h.
define <4 x i16> @fct18(ptr nocapture %sp0) {
; CHECK-LABEL: fct18:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr h0, [x0, #2]
; CHECK-NEXT: mul.4h v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i16, ptr %sp0, i64 1
%pix_sp0.0.copyload = load i16, ptr %addr, align 1
%vec = insertelement <4 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <4 x i16> %vec, %vec
ret <4 x i16> %vmull.i
}
; Loads the i16 at element index 1 of %sp0, inserts it into lane 0 of an
; otherwise undef <8 x i16>, and multiplies the vector by itself. The expected
; output is "ldr h0, [x0, #2]" followed by mul.8h.
define <8 x i16> @fct19(ptr nocapture %sp0) {
; CHECK-LABEL: fct19:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr h0, [x0, #2]
; CHECK-NEXT: mul.8h v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i16, ptr %sp0, i64 1
%pix_sp0.0.copyload = load i16, ptr %addr, align 1
%vec = insertelement <8 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <8 x i16> %vec, %vec
ret <8 x i16> %vmull.i
}
; Loads the i32 at element index 1 of %sp0, inserts it into lane 0 of an
; otherwise undef <2 x i32>, and multiplies the vector by itself. The expected
; output is "ldr s0, [x0, #4]" (index * 4 bytes) followed by mul.2s.
define <2 x i32> @fct20(ptr nocapture %sp0) {
; CHECK-LABEL: fct20:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr s0, [x0, #4]
; CHECK-NEXT: mul.2s v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i32, ptr %sp0, i64 1
%pix_sp0.0.copyload = load i32, ptr %addr, align 1
%vec = insertelement <2 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <2 x i32> %vec, %vec
ret <2 x i32> %vmull.i
}
; Loads the i32 at element index 1 of %sp0, inserts it into lane 0 of an
; otherwise undef <4 x i32>, and multiplies the vector by itself. The expected
; output is "ldr s0, [x0, #4]" followed by mul.4s.
define <4 x i32> @fct21(ptr nocapture %sp0) {
; CHECK-LABEL: fct21:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr s0, [x0, #4]
; CHECK-NEXT: mul.4s v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i32, ptr %sp0, i64 1
%pix_sp0.0.copyload = load i32, ptr %addr, align 1
%vec = insertelement <4 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <4 x i32> %vec, %vec
ret <4 x i32> %vmull.i
}
; Loads the i64 at element index 1 of %sp0 and returns it as the only lane of
; a <1 x i64>. There is no arithmetic here; the expected output is the single
; load "ldr d0, [x0, #8]" (index * 8 bytes).
define <1 x i64> @fct22(ptr nocapture %sp0) {
; CHECK-LABEL: fct22:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr d0, [x0, #8]
; CHECK-NEXT: ret
entry:
%addr = getelementptr i64, ptr %sp0, i64 1
%pix_sp0.0.copyload = load i64, ptr %addr, align 1
%vec = insertelement <1 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
ret <1 x i64> %vec
}
; Loads the i64 at element index 1 of %sp0 and inserts it into lane 0 of an
; otherwise undef <2 x i64>. The expected output is the single load
; "ldr d0, [x0, #8]"; lane 1 is left undefined by the IR.
define <2 x i64> @fct23(ptr nocapture %sp0) {
; CHECK-LABEL: fct23:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr d0, [x0, #8]
; CHECK-NEXT: ret
entry:
%addr = getelementptr i64, ptr %sp0, i64 1
%pix_sp0.0.copyload = load i64, ptr %addr, align 1
%vec = insertelement <2 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
ret <2 x i64> %vec
}
;
; Single loads with register offset.
; Register-offset variant of the single-element load: reads the i8 at
; %sp0[%offset], inserts it into lane 0 of an otherwise undef <8 x i8>, and
; multiplies the vector by itself. The expected output is the unshifted
; register-offset load "ldr b0, [x0, x1]" followed by mul.8b.
define <8 x i8> @fct24(ptr nocapture %sp0, i64 %offset) {
; CHECK-LABEL: fct24:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr b0, [x0, x1]
; CHECK-NEXT: mul.8b v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i8, ptr %sp0, i64 %offset
%pix_sp0.0.copyload = load i8, ptr %addr, align 1
%vec = insertelement <8 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <8 x i8> %vec, %vec
ret <8 x i8> %vmull.i
}
; Reads the i8 at %sp0[%offset], inserts it into lane 0 of an otherwise undef
; <16 x i8>, and multiplies the vector by itself. The expected output is
; "ldr b0, [x0, x1]" followed by mul.16b.
define <16 x i8> @fct25(ptr nocapture %sp0, i64 %offset) {
; CHECK-LABEL: fct25:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr b0, [x0, x1]
; CHECK-NEXT: mul.16b v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i8, ptr %sp0, i64 %offset
%pix_sp0.0.copyload = load i8, ptr %addr, align 1
%vec = insertelement <16 x i8> undef, i8 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <16 x i8> %vec, %vec
ret <16 x i8> %vmull.i
}
; Reads the i16 at %sp0[%offset], inserts it into lane 0 of an otherwise undef
; <4 x i16>, and multiplies the vector by itself. The expected output folds
; the element scaling into the load as "ldr h0, [x0, x1, lsl #1]", followed
; by mul.4h.
define <4 x i16> @fct26(ptr nocapture %sp0, i64 %offset) {
; CHECK-LABEL: fct26:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
; CHECK-NEXT: mul.4h v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i16, ptr %sp0, i64 %offset
%pix_sp0.0.copyload = load i16, ptr %addr, align 1
%vec = insertelement <4 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <4 x i16> %vec, %vec
ret <4 x i16> %vmull.i
}
; Reads the i16 at %sp0[%offset], inserts it into lane 0 of an otherwise undef
; <8 x i16>, and multiplies the vector by itself. The expected output is
; "ldr h0, [x0, x1, lsl #1]" followed by mul.8h.
define <8 x i16> @fct27(ptr nocapture %sp0, i64 %offset) {
; CHECK-LABEL: fct27:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
; CHECK-NEXT: mul.8h v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i16, ptr %sp0, i64 %offset
%pix_sp0.0.copyload = load i16, ptr %addr, align 1
%vec = insertelement <8 x i16> undef, i16 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <8 x i16> %vec, %vec
ret <8 x i16> %vmull.i
}
; Reads the i32 at %sp0[%offset], inserts it into lane 0 of an otherwise undef
; <2 x i32>, and multiplies the vector by itself. The expected output folds
; the element scaling into the load as "ldr s0, [x0, x1, lsl #2]", followed
; by mul.2s.
define <2 x i32> @fct28(ptr nocapture %sp0, i64 %offset) {
; CHECK-LABEL: fct28:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
; CHECK-NEXT: mul.2s v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i32, ptr %sp0, i64 %offset
%pix_sp0.0.copyload = load i32, ptr %addr, align 1
%vec = insertelement <2 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <2 x i32> %vec, %vec
ret <2 x i32> %vmull.i
}
; Reads the i32 at %sp0[%offset], inserts it into lane 0 of an otherwise undef
; <4 x i32>, and multiplies the vector by itself. The expected output is
; "ldr s0, [x0, x1, lsl #2]" followed by mul.4s.
define <4 x i32> @fct29(ptr nocapture %sp0, i64 %offset) {
; CHECK-LABEL: fct29:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
; CHECK-NEXT: mul.4s v0, v0, v0
; CHECK-NEXT: ret
entry:
%addr = getelementptr i32, ptr %sp0, i64 %offset
%pix_sp0.0.copyload = load i32, ptr %addr, align 1
%vec = insertelement <4 x i32> undef, i32 %pix_sp0.0.copyload, i32 0
%vmull.i = mul <4 x i32> %vec, %vec
ret <4 x i32> %vmull.i
}
; Reads the i64 at %sp0[%offset] and returns it as the only lane of a
; <1 x i64>. The expected output is the single scaled register-offset load
; "ldr d0, [x0, x1, lsl #3]".
define <1 x i64> @fct30(ptr nocapture %sp0, i64 %offset) {
; CHECK-LABEL: fct30:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%addr = getelementptr i64, ptr %sp0, i64 %offset
%pix_sp0.0.copyload = load i64, ptr %addr, align 1
%vec = insertelement <1 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
ret <1 x i64> %vec
}
; Reads the i64 at %sp0[%offset] and inserts it into lane 0 of an otherwise
; undef <2 x i64>. The expected output is the single scaled register-offset
; load "ldr d0, [x0, x1, lsl #3]"; lane 1 is left undefined by the IR.
define <2 x i64> @fct31(ptr nocapture %sp0, i64 %offset) {
; CHECK-LABEL: fct31:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
entry:
%addr = getelementptr i64, ptr %sp0, i64 %offset
%pix_sp0.0.copyload = load i64, ptr %addr, align 1
%vec = insertelement <2 x i64> undef, i64 %pix_sp0.0.copyload, i32 0
ret <2 x i64> %vec
}