llvm-project/clang/test/CodeGen/arm-neon-endianness.c
Oliver Stannard a619a2e53a
[ARM] Fix lane ordering for AdvSIMD intrinsics on big-endian targets (#127068)
In arm-neon.h, we insert shufflevectors around each intrinsic when the
target is big-endian, to compensate for the difference between the
ABI-defined memory format of vectors (with the whole vector stored as
one big-endian access) and LLVM's target-independent expectations (with
the lowest-numbered lane in the lowest address). However, this code was
written for the AArch64 ABI, and the AArch32 ABI differs slightly: it
requires that vectors are stored in memory as if stored with VSTM, which
does a series of 64-bit accesses, instead of the AArch64 VSTR, which
does a single 128-bit access. This means that for AArch32 we need to
reverse the lanes in each 64-bit chunk of the vector, instead of in the
whole vector.

Since there are only a small number of different shufflevector orderings
needed, I've split them out into macros, so that this doesn't need
separate conditions in each intrinsic definition.
2025-03-04 08:10:22 +00:00

116 lines
4.9 KiB
C

// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// REQUIRES: arm-registered-target
// RUN: %clang_cc1 -triple armv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
// RUN: opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=LE
// RUN: %clang_cc1 -triple armebv8a-arm-none-eabihf -target-cpu generic -emit-llvm -o - %s -disable-O0-optnone | \
// RUN: opt -S -passes=instcombine -o - | FileCheck %s --check-prefix=BE
#include <arm_neon.h>
// LE-LABEL: define dso_local i32 @int32x4_t_lane_0(
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x4_t_lane_0(
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
// BE-NEXT: ret i32 [[VGET_LANE]]
//
// Reads lane 0 of a <4 x i32> vector with vgetq_lane_s32. The assertions above
// expect extractelement index 0 for the little-endian run and index 1 for the
// big-endian run, i.e. the two i32 lanes of the first 64-bit half are swapped.
int int32x4_t_lane_0(int32x4_t a) { return vgetq_lane_s32(a, 0); }
// LE-LABEL: define dso_local i32 @int32x4_t_lane_1(
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 1
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x4_t_lane_1(
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 0
// BE-NEXT: ret i32 [[VGET_LANE]]
//
// Reads lane 1 of a <4 x i32> vector with vgetq_lane_s32. The assertions above
// expect extractelement index 1 for the little-endian run and index 0 for the
// big-endian run (the mirror image of the lane-0 case in the same 64-bit half).
int int32x4_t_lane_1(int32x4_t a) { return vgetq_lane_s32(a, 1); }
// LE-LABEL: define dso_local i32 @int32x4_t_lane_2(
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x4_t_lane_2(
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
// BE-NEXT: ret i32 [[VGET_LANE]]
//
// Reads lane 2 of a <4 x i32> vector with vgetq_lane_s32. The assertions above
// expect extractelement index 2 for the little-endian run and index 3 for the
// big-endian run: the swap stays inside the second 64-bit half rather than
// reversing the whole 128-bit vector (which would have produced index 1).
int int32x4_t_lane_2(int32x4_t a) { return vgetq_lane_s32(a, 2); }
// LE-LABEL: define dso_local i32 @int32x4_t_lane_3(
// LE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x4_t_lane_3(
// BE-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 2
// BE-NEXT: ret i32 [[VGET_LANE]]
//
// Reads lane 3 of a <4 x i32> vector with vgetq_lane_s32. The assertions above
// expect extractelement index 3 for the little-endian run and index 2 for the
// big-endian run (swap within the second 64-bit half; a whole-vector reversal
// would have produced index 0).
int int32x4_t_lane_3(int32x4_t a) { return vgetq_lane_s32(a, 3); }
// LE-LABEL: define dso_local i32 @int32x2_t_lane_0(
// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x2_t_lane_0(
// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
// BE-NEXT: ret i32 [[VGET_LANE]]
//
// Reads lane 0 of a 64-bit <2 x i32> vector with vget_lane_s32. The assertions
// above expect extractelement index 0 for the little-endian run and index 1
// for the big-endian run, since the vector is a single 64-bit chunk whose two
// lanes are reversed.
int int32x2_t_lane_0(int32x2_t a) { return vget_lane_s32(a, 0); }
// LE-LABEL: define dso_local i32 @int32x2_t_lane_1(
// LE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
// LE-NEXT: ret i32 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i32 @int32x2_t_lane_1(
// BE-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 0
// BE-NEXT: ret i32 [[VGET_LANE]]
//
// Reads lane 1 of a 64-bit <2 x i32> vector with vget_lane_s32. The assertions
// above expect extractelement index 1 for the little-endian run and index 0
// for the big-endian run.
int int32x2_t_lane_1(int32x2_t a) { return vget_lane_s32(a, 1); }
// LE-LABEL: define dso_local i64 @int64x2_t_lane_0(
// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
// LE-NEXT: ret i64 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i64 @int64x2_t_lane_0(
// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 0
// BE-NEXT: ret i64 [[VGET_LANE]]
//
// Reads lane 0 of a <2 x i64> vector with vgetq_lane_s64. The assertions above
// expect extractelement index 0 for both the little-endian and the big-endian
// run: each 64-bit chunk holds exactly one i64 lane, so a per-chunk lane
// reversal leaves the index unchanged.
int64_t int64x2_t_lane_0(int64x2_t a) { return vgetq_lane_s64(a, 0); }
// LE-LABEL: define dso_local i64 @int64x2_t_lane_1(
// LE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// LE-NEXT: [[ENTRY:.*:]]
// LE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
// LE-NEXT: ret i64 [[VGET_LANE]]
//
// BE-LABEL: define dso_local i64 @int64x2_t_lane_1(
// BE-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// BE-NEXT: [[ENTRY:.*:]]
// BE-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
// BE-NEXT: ret i64 [[VGET_LANE]]
//
// Reads lane 1 of a <2 x i64> vector with vgetq_lane_s64. The assertions above
// expect extractelement index 1 for both the little-endian and the big-endian
// run; a whole-vector reversal would instead have produced index 0 on
// big-endian.
int64_t int64x2_t_lane_1(int64x2_t a) { return vgetq_lane_s64(a, 1); }