llvm-project/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll
Manos Anagnostakis 008f26b12e
[AArch64] New subtarget features to control ldp and stp formation (#66098)
On some AArch64 cores, including Ampere's ampere1 and ampere1a
architectures, load and store pair instructions are faster compared to
simple loads/stores only when the alignment of the pair is at least
twice that of the individual element being loaded.

Based on that, this patch introduces four new subtarget features, two
for controlling ldp and two for controlling stp, to cover the ampere1
and ampere1a alignment needs and to enable optional fine-grained control
over ldp and stp generation in general. The latter can be utilized by
other CPUs as well, should they benefit from a policy different from
the compiler-provided default.

More specifically, for each of the ldp and stp respectively we have:

- disable-ldp/disable-stp: Do not emit ldp/stp.
- ldp-aligned-only/stp-aligned-only: Emit ldp/stp only if the source
pointer is aligned to at least double the alignment of the type.

Therefore, for -mcpu=ampere1 and -mcpu=ampere1a
ldp-aligned-only/stp-aligned-only become the defaults, because of the
benefit from the alignment, whereas for the rest of the cpus the default
behaviour of the compiler is maintained.
2023-09-14 16:58:39 +02:00

390 lines
13 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a | FileCheck %s --check-prefixes=CHECK
; RUN: llc < %s -O2 -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-DEFAULT
; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 -mattr=+disable-ldp | FileCheck %s --check-prefixes=CHECK-DISABLE-LDP
; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 -mattr=+disable-stp | FileCheck %s --check-prefixes=CHECK-DISABLE-STP
; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a -mattr=+disable-ldp | FileCheck %s --check-prefixes=CHECK-DISABLE-LDP
; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a -mattr=+disable-stp | FileCheck %s --check-prefixes=CHECK-DISABLE-STP
; Aligned i32 pair: mask the pointer to a 64-byte boundary, then load two
; consecutive i32 values (align 64 and align 4). The alignment is well above
; the 2x-type-alignment threshold, so ampere1/ampere1a (ldp-aligned-only) and
; the default target both form an ldp; only +disable-ldp splits the pair into
; two ldr instructions.
define i32 @ldp_aligned_int32_t(ptr %0) #0 {
; CHECK-LABEL: ldp_aligned_int32_t:
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x0, #0xffffffffffffffc0
; CHECK-NEXT: ldp w9, w8, [x8]
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: ldp_aligned_int32_t:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x8, x0, #0xffffffffffffffc0
; CHECK-DEFAULT-NEXT: ldp w9, w8, [x8]
; CHECK-DEFAULT-NEXT: add w0, w8, w9
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-LDP-LABEL: ldp_aligned_int32_t:
; CHECK-DISABLE-LDP: // %bb.0:
; CHECK-DISABLE-LDP-NEXT: and x8, x0, #0xffffffffffffffc0
; CHECK-DISABLE-LDP-NEXT: ldr w9, [x8]
; CHECK-DISABLE-LDP-NEXT: ldr w8, [x8, #4]
; CHECK-DISABLE-LDP-NEXT: add w0, w8, w9
; CHECK-DISABLE-LDP-NEXT: ret
  ; Round the incoming pointer down to 64 bytes, then load p[0] and p[1].
%2 = ptrtoint ptr %0 to i64
%3 = and i64 %2, -64
%4 = inttoptr i64 %3 to ptr
%5 = load i32, ptr %4, align 64
%6 = getelementptr inbounds i32, ptr %4, i64 1
%7 = load i32, ptr %6, align 4
%8 = add nsw i32 %7, %5
ret i32 %8
}
; Aligned i64 pair: mask the pointer to a 128-byte boundary, then load two
; consecutive i64 values (align 128 and align 8). Alignment exceeds the
; 2x-type-alignment threshold, so both ampere1/ampere1a and the default target
; emit an ldp; +disable-ldp forces two separate ldr instructions.
define i64 @ldp_aligned_int64_t(ptr %0) #0 {
; CHECK-LABEL: ldp_aligned_int64_t:
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x0, #0xffffffffffffff80
; CHECK-NEXT: ldp x9, x8, [x8]
; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: ldp_aligned_int64_t:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x8, x0, #0xffffffffffffff80
; CHECK-DEFAULT-NEXT: ldp x9, x8, [x8]
; CHECK-DEFAULT-NEXT: add x0, x8, x9
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-LDP-LABEL: ldp_aligned_int64_t:
; CHECK-DISABLE-LDP: // %bb.0:
; CHECK-DISABLE-LDP-NEXT: and x8, x0, #0xffffffffffffff80
; CHECK-DISABLE-LDP-NEXT: ldr x9, [x8]
; CHECK-DISABLE-LDP-NEXT: ldr x8, [x8, #8]
; CHECK-DISABLE-LDP-NEXT: add x0, x8, x9
; CHECK-DISABLE-LDP-NEXT: ret
  ; Round the incoming pointer down to 128 bytes, then load p[0] and p[1].
%2 = ptrtoint ptr %0 to i64
%3 = and i64 %2, -128
%4 = inttoptr i64 %3 to ptr
%5 = load i64, ptr %4, align 128
%6 = getelementptr inbounds i64, ptr %4, i64 1
%7 = load i64, ptr %6, align 8
%8 = add nsw i64 %7, %5
ret i64 %8
}
; Aligned 128-bit vector pair: mask the pointer to a 256-byte boundary, then
; load two consecutive <4 x i32> values (align 256 and align 16). Alignment
; exceeds the 2x-type-alignment threshold, so an ldp of q-registers is formed
; everywhere except under +disable-ldp, which yields two ldr q loads.
define <4 x i32> @ldp_aligned_v4si(ptr %0) #0 {
; CHECK-LABEL: ldp_aligned_v4si:
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x0, #0xffffffffffffff00
; CHECK-NEXT: ldp q0, q1, [x8]
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: ldp_aligned_v4si:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x8, x0, #0xffffffffffffff00
; CHECK-DEFAULT-NEXT: ldp q0, q1, [x8]
; CHECK-DEFAULT-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-LDP-LABEL: ldp_aligned_v4si:
; CHECK-DISABLE-LDP: // %bb.0:
; CHECK-DISABLE-LDP-NEXT: and x8, x0, #0xffffffffffffff00
; CHECK-DISABLE-LDP-NEXT: ldr q0, [x8]
; CHECK-DISABLE-LDP-NEXT: ldr q1, [x8, #16]
; CHECK-DISABLE-LDP-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-DISABLE-LDP-NEXT: ret
  ; Round the incoming pointer down to 256 bytes, then load p[0] and p[1].
%2 = ptrtoint ptr %0 to i64
%3 = and i64 %2, -256
%4 = inttoptr i64 %3 to ptr
%5 = load <4 x i32>, ptr %4, align 256
%6 = getelementptr inbounds <4 x i32>, ptr %4, i64 1
%7 = load <4 x i32>, ptr %6, align 16
%8 = add <4 x i32> %7, %5
ret <4 x i32> %8
}
; Unaligned i32 pair: the loads start at offset #4 with only align 4/8, below
; the 2x-type-alignment threshold. ampere1/ampere1a (ldp-aligned-only) keep two
; ldr instructions (same output as +disable-ldp), while the default target
; still forms an ldp at [x8, #4].
define i32 @ldp_unaligned_int32_t(ptr %0) #0 {
; CHECK-LABEL: ldp_unaligned_int32_t:
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x0, #0xffffffffffffffc0
; CHECK-NEXT: ldr w9, [x8, #4]
; CHECK-NEXT: ldr w8, [x8, #8]
; CHECK-NEXT: add w0, w8, w9
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: ldp_unaligned_int32_t:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x8, x0, #0xffffffffffffffc0
; CHECK-DEFAULT-NEXT: ldp w9, w8, [x8, #4]
; CHECK-DEFAULT-NEXT: add w0, w8, w9
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-LDP-LABEL: ldp_unaligned_int32_t:
; CHECK-DISABLE-LDP: // %bb.0:
; CHECK-DISABLE-LDP-NEXT: and x8, x0, #0xffffffffffffffc0
; CHECK-DISABLE-LDP-NEXT: ldr w9, [x8, #4]
; CHECK-DISABLE-LDP-NEXT: ldr w8, [x8, #8]
; CHECK-DISABLE-LDP-NEXT: add w0, w8, w9
; CHECK-DISABLE-LDP-NEXT: ret
  ; Round down to 64 bytes, then load p[1] and p[2] (pair not 2x-type aligned).
%2 = ptrtoint ptr %0 to i64
%3 = and i64 %2, -64
%4 = inttoptr i64 %3 to ptr
%5 = getelementptr inbounds i32, ptr %4, i64 1
%6 = load i32, ptr %5, align 4
%7 = getelementptr inbounds i32, ptr %4, i64 2
%8 = load i32, ptr %7, align 8
%9 = add nsw i32 %8, %6
ret i32 %9
}
; Unaligned i64 pair: the loads start at offset #8 with only align 8/16, below
; the 2x-type-alignment threshold for the pair. ampere1/ampere1a
; (ldp-aligned-only) keep two ldr instructions (same output as +disable-ldp),
; while the default target still forms an ldp at [x8, #8].
define i64 @ldp_unaligned_int64_t(ptr %0) #0 {
; CHECK-LABEL: ldp_unaligned_int64_t:
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x0, #0xffffffffffffff80
; CHECK-NEXT: ldr x9, [x8, #8]
; CHECK-NEXT: ldr x8, [x8, #16]
; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: ldp_unaligned_int64_t:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x8, x0, #0xffffffffffffff80
; CHECK-DEFAULT-NEXT: ldp x9, x8, [x8, #8]
; CHECK-DEFAULT-NEXT: add x0, x8, x9
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-LDP-LABEL: ldp_unaligned_int64_t:
; CHECK-DISABLE-LDP: // %bb.0:
; CHECK-DISABLE-LDP-NEXT: and x8, x0, #0xffffffffffffff80
; CHECK-DISABLE-LDP-NEXT: ldr x9, [x8, #8]
; CHECK-DISABLE-LDP-NEXT: ldr x8, [x8, #16]
; CHECK-DISABLE-LDP-NEXT: add x0, x8, x9
; CHECK-DISABLE-LDP-NEXT: ret
  ; Round down to 128 bytes, then load p[1] and p[2] (pair not 2x-type aligned).
%2 = ptrtoint ptr %0 to i64
%3 = and i64 %2, -128
%4 = inttoptr i64 %3 to ptr
%5 = getelementptr inbounds i64, ptr %4, i64 1
%6 = load i64, ptr %5, align 8
%7 = getelementptr inbounds i64, ptr %4, i64 2
%8 = load i64, ptr %7, align 16
%9 = add nsw i64 %8, %6
ret i64 %9
}
; Unaligned 128-bit vector pair: the loads start at offset #16 with only
; align 16/32, below the 2x-type-alignment threshold. ampere1/ampere1a
; (ldp-aligned-only) keep two ldr q loads (same output as +disable-ldp), while
; the default target still forms an ldp at [x8, #16].
define <4 x i32> @ldp_unaligned_v4si(ptr %0) #0 {
; CHECK-LABEL: ldp_unaligned_v4si:
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x0, #0xffffffffffffff00
; CHECK-NEXT: ldr q0, [x8, #16]
; CHECK-NEXT: ldr q1, [x8, #32]
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: ldp_unaligned_v4si:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x8, x0, #0xffffffffffffff00
; CHECK-DEFAULT-NEXT: ldp q0, q1, [x8, #16]
; CHECK-DEFAULT-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-LDP-LABEL: ldp_unaligned_v4si:
; CHECK-DISABLE-LDP: // %bb.0:
; CHECK-DISABLE-LDP-NEXT: and x8, x0, #0xffffffffffffff00
; CHECK-DISABLE-LDP-NEXT: ldr q0, [x8, #16]
; CHECK-DISABLE-LDP-NEXT: ldr q1, [x8, #32]
; CHECK-DISABLE-LDP-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-DISABLE-LDP-NEXT: ret
  ; Round down to 256 bytes, then load p[1] and p[2] (pair not 2x-type aligned).
%2 = ptrtoint ptr %0 to i64
%3 = and i64 %2, -256
%4 = inttoptr i64 %3 to ptr
%5 = getelementptr inbounds <4 x i32>, ptr %4, i64 1
%6 = load <4 x i32>, ptr %5, align 16
%7 = getelementptr inbounds <4 x i32>, ptr %4, i64 2
%8 = load <4 x i32>, ptr %7, align 32
%9 = add <4 x i32> %8, %6
ret <4 x i32> %9
}
; Aligned i32 store pair: mask the pointer to a 64-byte boundary, then store
; the same i32 to two consecutive slots (align 64 and align 4). Alignment
; exceeds the 2x-type-alignment threshold, so ampere1/ampere1a
; (stp-aligned-only) and the default target both form an stp; only
; +disable-stp splits it into two str instructions.
define ptr @stp_aligned_int32_t(ptr %0, i32 %1) #0 {
; CHECK-LABEL: stp_aligned_int32_t:
; CHECK: // %bb.0:
; CHECK-NEXT: and x0, x0, #0xffffffffffffffc0
; CHECK-NEXT: stp w1, w1, [x0]
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: stp_aligned_int32_t:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x0, x0, #0xffffffffffffffc0
; CHECK-DEFAULT-NEXT: stp w1, w1, [x0]
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-STP-LABEL: stp_aligned_int32_t:
; CHECK-DISABLE-STP: // %bb.0:
; CHECK-DISABLE-STP-NEXT: and x0, x0, #0xffffffffffffffc0
; CHECK-DISABLE-STP-NEXT: str w1, [x0]
; CHECK-DISABLE-STP-NEXT: str w1, [x0, #4]
; CHECK-DISABLE-STP-NEXT: ret
  ; Round the pointer down to 64 bytes, store to p[0] and p[1], return p.
%3 = ptrtoint ptr %0 to i64
%4 = and i64 %3, -64
%5 = inttoptr i64 %4 to ptr
store i32 %1, ptr %5, align 64
%6 = getelementptr inbounds i32, ptr %5, i64 1
store i32 %1, ptr %6, align 4
ret ptr %5
}
; Aligned i64 store pair: mask the pointer to a 128-byte boundary, then store
; the same i64 to two consecutive slots (align 128 and align 8). Alignment
; exceeds the 2x-type-alignment threshold, so an stp is formed everywhere
; except under +disable-stp, which yields two str instructions.
define dso_local ptr @stp_aligned_int64_t(ptr %0, i64 %1) #0 {
; CHECK-LABEL: stp_aligned_int64_t:
; CHECK: // %bb.0:
; CHECK-NEXT: and x0, x0, #0xffffffffffffff80
; CHECK-NEXT: stp x1, x1, [x0]
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: stp_aligned_int64_t:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x0, x0, #0xffffffffffffff80
; CHECK-DEFAULT-NEXT: stp x1, x1, [x0]
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-STP-LABEL: stp_aligned_int64_t:
; CHECK-DISABLE-STP: // %bb.0:
; CHECK-DISABLE-STP-NEXT: and x0, x0, #0xffffffffffffff80
; CHECK-DISABLE-STP-NEXT: str x1, [x0]
; CHECK-DISABLE-STP-NEXT: str x1, [x0, #8]
; CHECK-DISABLE-STP-NEXT: ret
  ; Round the pointer down to 128 bytes, store to p[0] and p[1], return p.
%3 = ptrtoint ptr %0 to i64
%4 = and i64 %3, -128
%5 = inttoptr i64 %4 to ptr
store i64 %1, ptr %5, align 128
%6 = getelementptr inbounds i64, ptr %5, i64 1
store i64 %1, ptr %6, align 8
ret ptr %5
}
; Aligned 128-bit vector store pair: mask the pointer to a 256-byte boundary,
; then store the same <4 x i32> to two consecutive slots (align 256 and
; align 16). Alignment exceeds the 2x-type-alignment threshold, so an stp of
; q-registers is formed everywhere except under +disable-stp, which yields two
; str q stores.
define ptr @stp_aligned_v4si(ptr %0, <4 x i32> %1) #0 {
; CHECK-LABEL: stp_aligned_v4si:
; CHECK: // %bb.0:
; CHECK-NEXT: and x0, x0, #0xffffffffffffff00
; CHECK-NEXT: stp q0, q0, [x0]
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: stp_aligned_v4si:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x0, x0, #0xffffffffffffff00
; CHECK-DEFAULT-NEXT: stp q0, q0, [x0]
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-STP-LABEL: stp_aligned_v4si:
; CHECK-DISABLE-STP: // %bb.0:
; CHECK-DISABLE-STP-NEXT: and x0, x0, #0xffffffffffffff00
; CHECK-DISABLE-STP-NEXT: str q0, [x0]
; CHECK-DISABLE-STP-NEXT: str q0, [x0, #16]
; CHECK-DISABLE-STP-NEXT: ret
  ; Round the pointer down to 256 bytes, store to p[0] and p[1], return p.
%3 = ptrtoint ptr %0 to i64
%4 = and i64 %3, -256
%5 = inttoptr i64 %4 to ptr
store <4 x i32> %1, ptr %5, align 256
%6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
store <4 x i32> %1, ptr %6, align 16
ret ptr %5
}
; Unaligned i32 store pair: the stores start at offset #4 with only align 4/8,
; below the 2x-type-alignment threshold. ampere1/ampere1a (stp-aligned-only)
; keep two str instructions (same output as +disable-stp), while the default
; target still forms an stp at [x8, #4].
define ptr @stp_unaligned_int32_t(ptr %0, i32 %1) #0 {
; CHECK-LABEL: stp_unaligned_int32_t:
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x0, #0xffffffffffffffc0
; CHECK-NEXT: orr x0, x8, #0x4
; CHECK-NEXT: str w1, [x8, #4]
; CHECK-NEXT: str w1, [x8, #8]
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: stp_unaligned_int32_t:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x8, x0, #0xffffffffffffffc0
; CHECK-DEFAULT-NEXT: orr x0, x8, #0x4
; CHECK-DEFAULT-NEXT: stp w1, w1, [x8, #4]
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-STP-LABEL: stp_unaligned_int32_t:
; CHECK-DISABLE-STP: // %bb.0:
; CHECK-DISABLE-STP-NEXT: and x8, x0, #0xffffffffffffffc0
; CHECK-DISABLE-STP-NEXT: orr x0, x8, #0x4
; CHECK-DISABLE-STP-NEXT: str w1, [x8, #4]
; CHECK-DISABLE-STP-NEXT: str w1, [x8, #8]
; CHECK-DISABLE-STP-NEXT: ret
  ; Round down to 64 bytes, store to p[1] and p[2] (pair not 2x-type aligned),
  ; and return &p[1].
%3 = ptrtoint ptr %0 to i64
%4 = and i64 %3, -64
%5 = inttoptr i64 %4 to ptr
%6 = getelementptr inbounds i32, ptr %5, i64 1
store i32 %1, ptr %6, align 4
%7 = getelementptr inbounds i32, ptr %5, i64 2
store i32 %1, ptr %7, align 8
ret ptr %6
}
; Unaligned i64 store pair: the stores start at offset #8 with only
; align 8/16, below the 2x-type-alignment threshold for the pair.
; ampere1/ampere1a (stp-aligned-only) keep two str instructions (same output
; as +disable-stp), while the default target still forms an stp at [x8, #8].
define ptr @stp_unaligned_int64_t(ptr %0, i64 %1) #0 {
; CHECK-LABEL: stp_unaligned_int64_t:
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x0, #0xffffffffffffff80
; CHECK-NEXT: orr x0, x8, #0x8
; CHECK-NEXT: str x1, [x8, #8]
; CHECK-NEXT: str x1, [x8, #16]
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: stp_unaligned_int64_t:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x8, x0, #0xffffffffffffff80
; CHECK-DEFAULT-NEXT: orr x0, x8, #0x8
; CHECK-DEFAULT-NEXT: stp x1, x1, [x8, #8]
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-STP-LABEL: stp_unaligned_int64_t:
; CHECK-DISABLE-STP: // %bb.0:
; CHECK-DISABLE-STP-NEXT: and x8, x0, #0xffffffffffffff80
; CHECK-DISABLE-STP-NEXT: orr x0, x8, #0x8
; CHECK-DISABLE-STP-NEXT: str x1, [x8, #8]
; CHECK-DISABLE-STP-NEXT: str x1, [x8, #16]
; CHECK-DISABLE-STP-NEXT: ret
  ; Round down to 128 bytes, store to p[1] and p[2] (pair not 2x-type aligned),
  ; and return &p[1].
%3 = ptrtoint ptr %0 to i64
%4 = and i64 %3, -128
%5 = inttoptr i64 %4 to ptr
%6 = getelementptr inbounds i64, ptr %5, i64 1
store i64 %1, ptr %6, align 8
%7 = getelementptr inbounds i64, ptr %5, i64 2
store i64 %1, ptr %7, align 16
ret ptr %6
}
; Unaligned 128-bit vector store pair: the stores start at offset #16 with
; only align 16/32, below the 2x-type-alignment threshold. ampere1/ampere1a
; (stp-aligned-only) keep two str q stores (same output as +disable-stp),
; while the default target still forms an stp at [x8, #16].
define ptr @stp_unaligned_v4si(ptr %0, <4 x i32> %1) #0 {
; CHECK-LABEL: stp_unaligned_v4si:
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x0, #0xffffffffffffff00
; CHECK-NEXT: orr x0, x8, #0x10
; CHECK-NEXT: str q0, [x8, #16]
; CHECK-NEXT: str q0, [x8, #32]
; CHECK-NEXT: ret
;
; CHECK-DEFAULT-LABEL: stp_unaligned_v4si:
; CHECK-DEFAULT: // %bb.0:
; CHECK-DEFAULT-NEXT: and x8, x0, #0xffffffffffffff00
; CHECK-DEFAULT-NEXT: orr x0, x8, #0x10
; CHECK-DEFAULT-NEXT: stp q0, q0, [x8, #16]
; CHECK-DEFAULT-NEXT: ret
;
; CHECK-DISABLE-STP-LABEL: stp_unaligned_v4si:
; CHECK-DISABLE-STP: // %bb.0:
; CHECK-DISABLE-STP-NEXT: and x8, x0, #0xffffffffffffff00
; CHECK-DISABLE-STP-NEXT: orr x0, x8, #0x10
; CHECK-DISABLE-STP-NEXT: str q0, [x8, #16]
; CHECK-DISABLE-STP-NEXT: str q0, [x8, #32]
; CHECK-DISABLE-STP-NEXT: ret
  ; Round down to 256 bytes, store to p[1] and p[2] (pair not 2x-type aligned),
  ; and return &p[1].
%3 = ptrtoint ptr %0 to i64
%4 = and i64 %3, -256
%5 = inttoptr i64 %4 to ptr
%6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
store <4 x i32> %1, ptr %6, align 16
%7 = getelementptr inbounds <4 x i32>, ptr %5, i64 2
store <4 x i32> %1, ptr %7, align 32
ret ptr %6
}